// core/stdarch/crates/core_arch/src/x86/avx512fp16.rs

1use crate::arch::asm;
2use crate::core_arch::{simd::*, x86::*};
3use crate::intrinsics::{fmaf16, simd::*};
4use crate::ptr;
5
6/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
7///
8/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
9#[inline]
10#[target_feature(enable = "avx512fp16")]
11#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
13pub const fn _mm_set_ph(
14    e7: f16,
15    e6: f16,
16    e5: f16,
17    e4: f16,
18    e3: f16,
19    e2: f16,
20    e1: f16,
21    e0: f16,
22) -> __m128h {
23    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
24}
25
26/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
27///
28/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
29#[inline]
30#[target_feature(enable = "avx512fp16")]
31#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
32#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
33pub const fn _mm256_set_ph(
34    e15: f16,
35    e14: f16,
36    e13: f16,
37    e12: f16,
38    e11: f16,
39    e10: f16,
40    e9: f16,
41    e8: f16,
42    e7: f16,
43    e6: f16,
44    e5: f16,
45    e4: f16,
46    e3: f16,
47    e2: f16,
48    e1: f16,
49    e0: f16,
50) -> __m256h {
51    __m256h([
52        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
53    ])
54}
55
56/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
57///
58/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
59#[inline]
60#[target_feature(enable = "avx512fp16")]
61#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
62#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
63pub const fn _mm512_set_ph(
64    e31: f16,
65    e30: f16,
66    e29: f16,
67    e28: f16,
68    e27: f16,
69    e26: f16,
70    e25: f16,
71    e24: f16,
72    e23: f16,
73    e22: f16,
74    e21: f16,
75    e20: f16,
76    e19: f16,
77    e18: f16,
78    e17: f16,
79    e16: f16,
80    e15: f16,
81    e14: f16,
82    e13: f16,
83    e12: f16,
84    e11: f16,
85    e10: f16,
86    e9: f16,
87    e8: f16,
88    e7: f16,
89    e6: f16,
90    e5: f16,
91    e4: f16,
92    e3: f16,
93    e2: f16,
94    e1: f16,
95    e0: f16,
96) -> __m512h {
97    __m512h([
98        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
99        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
100    ])
101}
102
103/// Copy half-precision (16-bit) floating-point elements from a to the lower element of dst and zero
104/// the upper 7 elements.
105///
106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
107#[inline]
108#[target_feature(enable = "avx512fp16")]
109#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
110#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
111pub const fn _mm_set_sh(a: f16) -> __m128h {
112    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
113}
114
115/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
116///
117/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
118#[inline]
119#[target_feature(enable = "avx512fp16")]
120#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
121#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
122pub const fn _mm_set1_ph(a: f16) -> __m128h {
123    unsafe { transmute(f16x8::splat(a)) }
124}
125
126/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
127///
128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
129#[inline]
130#[target_feature(enable = "avx512fp16")]
131#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
132#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
133pub const fn _mm256_set1_ph(a: f16) -> __m256h {
134    unsafe { transmute(f16x16::splat(a)) }
135}
136
137/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
138///
139/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
140#[inline]
141#[target_feature(enable = "avx512fp16")]
142#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
143#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
144pub const fn _mm512_set1_ph(a: f16) -> __m512h {
145    unsafe { transmute(f16x32::splat(a)) }
146}
147
148/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
149///
150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
151#[inline]
152#[target_feature(enable = "avx512fp16")]
153#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
154#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
155pub const fn _mm_setr_ph(
156    e0: f16,
157    e1: f16,
158    e2: f16,
159    e3: f16,
160    e4: f16,
161    e5: f16,
162    e6: f16,
163    e7: f16,
164) -> __m128h {
165    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
166}
167
168/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
169///
170/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
171#[inline]
172#[target_feature(enable = "avx512fp16")]
173#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
174#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
175pub const fn _mm256_setr_ph(
176    e0: f16,
177    e1: f16,
178    e2: f16,
179    e3: f16,
180    e4: f16,
181    e5: f16,
182    e6: f16,
183    e7: f16,
184    e8: f16,
185    e9: f16,
186    e10: f16,
187    e11: f16,
188    e12: f16,
189    e13: f16,
190    e14: f16,
191    e15: f16,
192) -> __m256h {
193    __m256h([
194        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
195    ])
196}
197
198/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
199///
200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
201#[inline]
202#[target_feature(enable = "avx512fp16")]
203#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
204#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
205pub const fn _mm512_setr_ph(
206    e0: f16,
207    e1: f16,
208    e2: f16,
209    e3: f16,
210    e4: f16,
211    e5: f16,
212    e6: f16,
213    e7: f16,
214    e8: f16,
215    e9: f16,
216    e10: f16,
217    e11: f16,
218    e12: f16,
219    e13: f16,
220    e14: f16,
221    e15: f16,
222    e16: f16,
223    e17: f16,
224    e18: f16,
225    e19: f16,
226    e20: f16,
227    e21: f16,
228    e22: f16,
229    e23: f16,
230    e24: f16,
231    e25: f16,
232    e26: f16,
233    e27: f16,
234    e28: f16,
235    e29: f16,
236    e30: f16,
237    e31: f16,
238) -> __m512h {
239    __m512h([
240        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
241        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
242    ])
243}
244
245/// Return vector of type __m128h with all elements set to zero.
246///
247/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
248#[inline]
249#[target_feature(enable = "avx512fp16,avx512vl")]
250#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
251#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
252pub const fn _mm_setzero_ph() -> __m128h {
253    unsafe { transmute(f16x8::ZERO) }
254}
255
256/// Return vector of type __m256h with all elements set to zero.
257///
258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
259#[inline]
260#[target_feature(enable = "avx512fp16,avx512vl")]
261#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
262#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
263pub const fn _mm256_setzero_ph() -> __m256h {
264    f16x16::ZERO.as_m256h()
265}
266
267/// Return vector of type __m512h with all elements set to zero.
268///
269/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
270#[inline]
271#[target_feature(enable = "avx512fp16")]
272#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
273#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
274pub const fn _mm512_setzero_ph() -> __m512h {
275    f16x32::ZERO.as_m512h()
276}
277
278/// Return vector of type `__m128h` with indetermination elements.
279/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
280/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
281/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
282///
283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
284#[inline]
285#[target_feature(enable = "avx512fp16,avx512vl")]
286#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
287#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
288pub const fn _mm_undefined_ph() -> __m128h {
289    f16x8::ZERO.as_m128h()
290}
291
292/// Return vector of type `__m256h` with indetermination elements.
293/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
294/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
295/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
296///
297/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
298#[inline]
299#[target_feature(enable = "avx512fp16,avx512vl")]
300#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
301#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
302pub const fn _mm256_undefined_ph() -> __m256h {
303    f16x16::ZERO.as_m256h()
304}
305
306/// Return vector of type `__m512h` with indetermination elements.
307/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
308/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
309/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
310///
311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
312#[inline]
313#[target_feature(enable = "avx512fp16")]
314#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
315#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
316pub const fn _mm512_undefined_ph() -> __m512h {
317    f16x32::ZERO.as_m512h()
318}
319
320/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
321/// does not generate any instructions, thus it has zero latency.
322///
323/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
324#[inline]
325#[target_feature(enable = "avx512fp16")]
326#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
327#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
328pub const fn _mm_castpd_ph(a: __m128d) -> __m128h {
329    unsafe { transmute(a) }
330}
331
332/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
333/// does not generate any instructions, thus it has zero latency.
334///
335/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
336#[inline]
337#[target_feature(enable = "avx512fp16")]
338#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
339#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
340pub const fn _mm256_castpd_ph(a: __m256d) -> __m256h {
341    unsafe { transmute(a) }
342}
343
344/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
345/// does not generate any instructions, thus it has zero latency.
346///
347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
348#[inline]
349#[target_feature(enable = "avx512fp16")]
350#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
351#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
352pub const fn _mm512_castpd_ph(a: __m512d) -> __m512h {
353    unsafe { transmute(a) }
354}
355
356/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
357/// does not generate any instructions, thus it has zero latency.
358///
359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
360#[inline]
361#[target_feature(enable = "avx512fp16")]
362#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
363#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
364pub const fn _mm_castph_pd(a: __m128h) -> __m128d {
365    unsafe { transmute(a) }
366}
367
368/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
369/// does not generate any instructions, thus it has zero latency.
370///
371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
372#[inline]
373#[target_feature(enable = "avx512fp16")]
374#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
375#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
376pub const fn _mm256_castph_pd(a: __m256h) -> __m256d {
377    unsafe { transmute(a) }
378}
379
380/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
381/// does not generate any instructions, thus it has zero latency.
382///
383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
384#[inline]
385#[target_feature(enable = "avx512fp16")]
386#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
387#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
388pub const fn _mm512_castph_pd(a: __m512h) -> __m512d {
389    unsafe { transmute(a) }
390}
391
392/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
393/// does not generate any instructions, thus it has zero latency.
394///
395/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
396#[inline]
397#[target_feature(enable = "avx512fp16")]
398#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
399#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
400pub const fn _mm_castps_ph(a: __m128) -> __m128h {
401    unsafe { transmute(a) }
402}
403
404/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
405/// does not generate any instructions, thus it has zero latency.
406///
407/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
408#[inline]
409#[target_feature(enable = "avx512fp16")]
410#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
411#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
412pub const fn _mm256_castps_ph(a: __m256) -> __m256h {
413    unsafe { transmute(a) }
414}
415
416/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
417/// does not generate any instructions, thus it has zero latency.
418///
419/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
420#[inline]
421#[target_feature(enable = "avx512fp16")]
422#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
423#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
424pub const fn _mm512_castps_ph(a: __m512) -> __m512h {
425    unsafe { transmute(a) }
426}
427
428/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
429/// does not generate any instructions, thus it has zero latency.
430///
431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
432#[inline]
433#[target_feature(enable = "avx512fp16")]
434#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
435#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
436pub const fn _mm_castph_ps(a: __m128h) -> __m128 {
437    unsafe { transmute(a) }
438}
439
440/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
441/// does not generate any instructions, thus it has zero latency.
442///
443/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
444#[inline]
445#[target_feature(enable = "avx512fp16")]
446#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
447#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
448pub const fn _mm256_castph_ps(a: __m256h) -> __m256 {
449    unsafe { transmute(a) }
450}
451
452/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
453/// does not generate any instructions, thus it has zero latency.
454///
455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
456#[inline]
457#[target_feature(enable = "avx512fp16")]
458#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
459#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
460pub const fn _mm512_castph_ps(a: __m512h) -> __m512 {
461    unsafe { transmute(a) }
462}
463
464/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
465/// does not generate any instructions, thus it has zero latency.
466///
467/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
468#[inline]
469#[target_feature(enable = "avx512fp16")]
470#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
471#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
472pub const fn _mm_castsi128_ph(a: __m128i) -> __m128h {
473    unsafe { transmute(a) }
474}
475
476/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
477/// does not generate any instructions, thus it has zero latency.
478///
479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
480#[inline]
481#[target_feature(enable = "avx512fp16")]
482#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
483#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
484pub const fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
485    unsafe { transmute(a) }
486}
487
488/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
489/// does not generate any instructions, thus it has zero latency.
490///
491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
492#[inline]
493#[target_feature(enable = "avx512fp16")]
494#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
495#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
496pub const fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
497    unsafe { transmute(a) }
498}
499
500/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
501/// does not generate any instructions, thus it has zero latency.
502///
503/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
504#[inline]
505#[target_feature(enable = "avx512fp16")]
506#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
507#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
508pub const fn _mm_castph_si128(a: __m128h) -> __m128i {
509    unsafe { transmute(a) }
510}
511
512/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
513/// does not generate any instructions, thus it has zero latency.
514///
515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
516#[inline]
517#[target_feature(enable = "avx512fp16")]
518#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
519#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
520pub const fn _mm256_castph_si256(a: __m256h) -> __m256i {
521    unsafe { transmute(a) }
522}
523
524/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
525/// does not generate any instructions, thus it has zero latency.
526///
527/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
528#[inline]
529#[target_feature(enable = "avx512fp16")]
530#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
531#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
532pub const fn _mm512_castph_si512(a: __m512h) -> __m512i {
533    unsafe { transmute(a) }
534}
535
536/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
537/// does not generate any instructions, thus it has zero latency.
538///
539/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
540#[inline]
541#[target_feature(enable = "avx512fp16")]
542#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
543#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
544pub const fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
545    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
546}
547
548/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
549/// does not generate any instructions, thus it has zero latency.
550///
551/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
552#[inline]
553#[target_feature(enable = "avx512fp16")]
554#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
555#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
556pub const fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
557    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
558}
559
560/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
561/// does not generate any instructions, thus it has zero latency.
562///
563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
564#[inline]
565#[target_feature(enable = "avx512fp16")]
566#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
567#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
568pub const fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
569    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
570}
571
572/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
573/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
574/// but most of the time it does not generate any instructions.
575///
576/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
577#[inline]
578#[target_feature(enable = "avx512fp16")]
579#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
580#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
581pub const fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
582    unsafe {
583        simd_shuffle!(
584            a,
585            _mm_undefined_ph(),
586            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
587        )
588    }
589}
590
591/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
592/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
593/// but most of the time it does not generate any instructions.
594///
595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
596#[inline]
597#[target_feature(enable = "avx512fp16")]
598#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
599#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
600pub const fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
601    unsafe {
602        simd_shuffle!(
603            a,
604            _mm_undefined_ph(),
605            [
606                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
607                8, 8, 8, 8
608            ]
609        )
610    }
611}
612
613/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
614/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
615/// but most of the time it does not generate any instructions.
616///
617/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
618#[inline]
619#[target_feature(enable = "avx512fp16")]
620#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
621#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
622pub const fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
623    unsafe {
624        simd_shuffle!(
625            a,
626            _mm256_undefined_ph(),
627            [
628                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
629                16, 16, 16, 16, 16, 16, 16, 16, 16
630            ]
631        )
632    }
633}
634
635/// Cast vector of type `__m256h` to type `__m128h`. The upper 8 elements of the result are zeroed.
636/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
637/// any instructions.
638///
639/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
640#[inline]
641#[target_feature(enable = "avx512fp16")]
642#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
643#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
644pub const fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
645    unsafe {
646        simd_shuffle!(
647            a,
648            _mm_setzero_ph(),
649            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
650        )
651    }
652}
653
654/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
655/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
656/// any instructions.
657///
658/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
659#[inline]
660#[target_feature(enable = "avx512fp16")]
661#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
662#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
663pub const fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
664    unsafe {
665        simd_shuffle!(
666            a,
667            _mm256_setzero_ph(),
668            [
669                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
670                16, 16, 16, 16, 16, 16, 16, 16, 16
671            ]
672        )
673    }
674}
675
676/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
677/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
678/// any instructions.
679///
680/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
681#[inline]
682#[target_feature(enable = "avx512fp16")]
683#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
684#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
685pub const fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
686    unsafe {
687        simd_shuffle!(
688            a,
689            _mm_setzero_ph(),
690            [
691                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
692                8, 8, 8, 8
693            ]
694        )
695    }
696}
697
698/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
699/// operand specified by imm8, and store the results in mask vector k.
700///
701/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
702#[inline]
703#[target_feature(enable = "avx512fp16,avx512vl")]
704#[rustc_legacy_const_generics(2)]
705#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
706pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
707    _mm_mask_cmp_ph_mask::<IMM5>(!0, a, b)
708}
709
710/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
711/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
712/// zeroed out when the corresponding mask bit is not set).
713///
714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
715#[inline]
716#[target_feature(enable = "avx512fp16,avx512vl")]
717#[rustc_legacy_const_generics(3)]
718#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
719pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
720    unsafe {
721        static_assert_uimm_bits!(IMM5, 5);
722        vcmpph_128(a, b, IMM5, k1)
723    }
724}
725
726/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
727/// operand specified by imm8, and store the results in mask vector k.
728///
729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
730#[inline]
731#[target_feature(enable = "avx512fp16,avx512vl")]
732#[rustc_legacy_const_generics(2)]
733#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
734pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
735    _mm256_mask_cmp_ph_mask::<IMM5>(!0, a, b)
736}
737
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    unsafe {
        // IMM5 selects one of the 32 `_CMP_*` predicates; only the low 5 bits are valid.
        static_assert_uimm_bits!(IMM5, 5);
        // 16 lanes in 256 bits, hence the __mmask16 result; k1 is applied as the zeromask.
        vcmpph_256(a, b, IMM5, k1)
    }
}
757
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    // All-ones mask: every lane participates (unmasked compare).
    _mm512_mask_cmp_ph_mask::<IMM5>(!0, a, b)
}
769
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    // Delegate to the round-form with the current rounding direction (no SAE override).
    _mm512_mask_cmp_round_ph_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}
786
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    // All-ones mask: every lane participates (unmasked compare).
    _mm512_mask_cmp_round_ph_mask::<IMM5, SAE>(!0, a, b)
}
803
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        // IMM5 selects one of the 32 `_CMP_*` predicates; SAE must be a valid sae constant
        // (_MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC).
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcmpph_512(a, b, IMM5, k1, SAE)
    }
}
826
827/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
828/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
829/// passing _MM_FROUND_NO_EXC in the sae parameter.
830///
831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
832#[inline]
833#[target_feature(enable = "avx512fp16")]
834#[rustc_legacy_const_generics(2, 3)]
835#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
836pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
837    static_assert_uimm_bits!(IMM5, 5);
838    static_assert_sae!(SAE);
839    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
840}
841
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    unsafe {
        // IMM5 selects one of the 32 `_CMP_*` predicates; SAE must be a valid sae constant.
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        // Scalar compare: only bit 0 of the result is meaningful, gated by bit 0 of k1.
        vcmpsh(a, b, IMM5, k1, SAE)
    }
}
862
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    // Validate IMM5 early, then defer to the round form with the current rounding direction.
    static_assert_uimm_bits!(IMM5, 5);
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
875
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    // Validate IMM5 early, then defer to the round form with the current rounding direction.
    static_assert_uimm_bits!(IMM5, 5);
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}
888
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    unsafe {
        // IMM5 selects one of the 32 `_CMP_*` predicates; SAE must be a valid sae constant.
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        // vcomish yields the 0/1 boolean result described in the doc above.
        vcomish(a, b, IMM5, SAE)
    }
}
905
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    // Validate IMM5 early, then defer to the round form with the current rounding direction.
    static_assert_uimm_bits!(IMM5, 5);
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
918
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_EQ_OS: ordered, signaling equality; the quiet counterpart is _mm_ucomieq_sh.
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}
929
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_GE_OS: ordered, signaling greater-than-or-equal; quiet counterpart: _mm_ucomige_sh.
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}
940
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_GT_OS: ordered, signaling greater-than; quiet counterpart: _mm_ucomigt_sh.
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}
951
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_LE_OS: ordered, signaling less-than-or-equal; quiet counterpart: _mm_ucomile_sh.
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}
962
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_LT_OS: ordered, signaling less-than; quiet counterpart: _mm_ucomilt_sh.
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}
973
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_NEQ_US: unordered, signaling not-equal; quiet counterpart: _mm_ucomineq_sh.
    _mm_comi_sh::<_CMP_NEQ_US>(a, b)
}
984
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_EQ_OQ: ordered, quiet equality — no exception for QNaN inputs (see doc above).
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}
995
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_GE_OQ: ordered, quiet greater-than-or-equal (no exception for QNaNs).
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}
1006
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_GT_OQ: ordered, quiet greater-than (no exception for QNaNs).
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}
1017
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_LE_OQ: ordered, quiet less-than-or-equal (no exception for QNaNs).
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}
1028
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_LT_OQ: ordered, quiet less-than (no exception for QNaNs).
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}
1039
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_NEQ_UQ: unordered, quiet not-equal (no exception for QNaNs).
    _mm_comi_sh::<_CMP_NEQ_UQ>(a, b)
}
1050
1051/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
1052/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
1053///
1054/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
1055#[inline]
1056#[target_feature(enable = "avx512fp16,avx512vl")]
1057#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1058#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1059pub const unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
1060    *mem_addr.cast()
1061}
1062
1063/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
1064/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
1065///
1066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
1067#[inline]
1068#[target_feature(enable = "avx512fp16,avx512vl")]
1069#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1070#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1071pub const unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
1072    *mem_addr.cast()
1073}
1074
1075/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
1076/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
1077///
1078/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
1079#[inline]
1080#[target_feature(enable = "avx512fp16")]
1081#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1082#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1083pub const unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
1084    *mem_addr.cast()
1085}
1086
1087/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
1088/// and zero the upper elements
1089///
1090/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
1091#[inline]
1092#[target_feature(enable = "avx512fp16")]
1093#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1094#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1095pub const unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
1096    _mm_set_sh(*mem_addr)
1097}
1098
/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
    // Seed dst with src so the mask-without-{z} (merging) form keeps src's low element when
    // bit 0 of k is clear; vpl! is a file-local helper that splices in the {p} memory operand.
    let mut dst = src;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}"),
        dst = inout(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        // pure + readonly: the asm only reads *mem_addr and has no other side effects.
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}
1117
/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
    // The {z} (zeroing) form fully defines dst, so a plain `out` operand suffices here
    // (contrast with the `inout` merging form in _mm_mask_load_sh).
    let mut dst: __m128h;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
        dst = out(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        // pure + readonly: the asm only reads *mem_addr and has no other side effects.
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}
1136
1137/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
1138/// a new vector. The address does not need to be aligned to any particular boundary.
1139///
1140/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
1141#[inline]
1142#[target_feature(enable = "avx512fp16,avx512vl")]
1143#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1144#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1145pub const unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
1146    ptr::read_unaligned(mem_addr.cast())
1147}
1148
1149/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
1150/// a new vector. The address does not need to be aligned to any particular boundary.
1151///
1152/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
1153#[inline]
1154#[target_feature(enable = "avx512fp16,avx512vl")]
1155#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1156#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1157pub const unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
1158    ptr::read_unaligned(mem_addr.cast())
1159}
1160
1161/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
1162/// a new vector. The address does not need to be aligned to any particular boundary.
1163///
1164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
1165#[inline]
1166#[target_feature(enable = "avx512fp16")]
1167#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1168#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1169pub const unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
1170    ptr::read_unaligned(mem_addr.cast())
1171}
1172
1173/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1174/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
1175/// 7 packed elements from a to the upper elements of dst.
1176///
1177/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
1178#[inline]
1179#[target_feature(enable = "avx512fp16")]
1180#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1181#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1182pub const fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1183    unsafe {
1184        let mut mov: f16 = simd_extract!(src, 0);
1185        if (k & 1) != 0 {
1186            mov = simd_extract!(b, 0);
1187        }
1188        simd_insert!(a, 0, mov)
1189    }
1190}
1191
1192/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1193/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
1194/// elements from a to the upper elements of dst.
1195///
1196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
1197#[inline]
1198#[target_feature(enable = "avx512fp16")]
1199#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1200#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1201pub const fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1202    unsafe {
1203        let mut mov: f16 = 0.;
1204        if (k & 1) != 0 {
1205            mov = simd_extract!(b, 0);
1206        }
1207        simd_insert!(a, 0, mov)
1208    }
1209}
1210
1211/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
1212/// and copy the upper 7 packed elements from a to the upper elements of dst.
1213///
1214/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
1215#[inline]
1216#[target_feature(enable = "avx512fp16")]
1217#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1218#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1219pub const fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
1220    unsafe {
1221        let mov: f16 = simd_extract!(b, 0);
1222        simd_insert!(a, 0, mov)
1223    }
1224}
1225
1226/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1227/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
1228///
1229/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
1230#[inline]
1231#[target_feature(enable = "avx512fp16,avx512vl")]
1232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1233#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1234pub const unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
1235    *mem_addr.cast() = a;
1236}
1237
1238/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1239/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
1240///
1241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
1242#[inline]
1243#[target_feature(enable = "avx512fp16,avx512vl")]
1244#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1245#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1246pub const unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
1247    *mem_addr.cast() = a;
1248}
1249
1250/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1251/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
1252///
1253/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
1254#[inline]
1255#[target_feature(enable = "avx512fp16")]
1256#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1257#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1258pub const unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
1259    *mem_addr.cast() = a;
1260}
1261
1262/// Store the lower half-precision (16-bit) floating-point element from a into memory.
1263///
1264/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
1265#[inline]
1266#[target_feature(enable = "avx512fp16")]
1267#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1268#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1269pub const unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
1270    *mem_addr = simd_extract!(a, 0);
1271}
1272
1273/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
1274///
1275/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
1276#[inline]
1277#[target_feature(enable = "avx512fp16")]
1278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1279pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
1280    asm!(
1281        vps!("vmovdqu16", "{{{k}}}, {src}"),
1282        p = in(reg) mem_addr,
1283        k = in(kreg) k,
1284        src = in(xmm_reg) a,
1285        options(nostack, preserves_flags)
1286    );
1287}
1288
1289/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1290/// The address does not need to be aligned to any particular boundary.
1291///
1292/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
1293#[inline]
1294#[target_feature(enable = "avx512fp16,avx512vl")]
1295#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1296#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1297pub const unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
1298    ptr::write_unaligned(mem_addr.cast(), a);
1299}
1300
1301/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1302/// The address does not need to be aligned to any particular boundary.
1303///
1304/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
1305#[inline]
1306#[target_feature(enable = "avx512fp16,avx512vl")]
1307#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1308#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1309pub const unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
1310    ptr::write_unaligned(mem_addr.cast(), a);
1311}
1312
1313/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1314/// The address does not need to be aligned to any particular boundary.
1315///
1316/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
1317#[inline]
1318#[target_feature(enable = "avx512fp16")]
1319#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1320#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1321pub const unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
1322    ptr::write_unaligned(mem_addr.cast(), a);
1323}
1324
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
    // Lane-wise addition; lowers to `vaddph` (asserted above).
    unsafe { simd_add(a, b) }
}
1336
1337/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1338/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1339///
1340/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
1341#[inline]
1342#[target_feature(enable = "avx512fp16,avx512vl")]
1343#[cfg_attr(test, assert_instr(vaddph))]
1344#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1345#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1346pub const fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1347    unsafe {
1348        let r = _mm_add_ph(a, b);
1349        simd_select_bitmask(k, r, src)
1350    }
1351}
1352
1353/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1354/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1355///
1356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
1357#[inline]
1358#[target_feature(enable = "avx512fp16,avx512vl")]
1359#[cfg_attr(test, assert_instr(vaddph))]
1360#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1361#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1362pub const fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1363    unsafe {
1364        let r = _mm_add_ph(a, b);
1365        simd_select_bitmask(k, r, _mm_setzero_ph())
1366    }
1367}
1368
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
    // Lane-wise addition; lowers to `vaddph` (asserted above).
    unsafe { simd_add(a, b) }
}
1380
1381/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1382/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1383///
1384/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
1385#[inline]
1386#[target_feature(enable = "avx512fp16,avx512vl")]
1387#[cfg_attr(test, assert_instr(vaddph))]
1388#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1389#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1390pub const fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1391    unsafe {
1392        let r = _mm256_add_ph(a, b);
1393        simd_select_bitmask(k, r, src)
1394    }
1395}
1396
1397/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1398/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1399///
1400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
1401#[inline]
1402#[target_feature(enable = "avx512fp16,avx512vl")]
1403#[cfg_attr(test, assert_instr(vaddph))]
1404#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1405#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1406pub const fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1407    unsafe {
1408        let r = _mm256_add_ph(a, b);
1409        simd_select_bitmask(k, r, _mm256_setzero_ph())
1410    }
1411}
1412
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`.
    unsafe { simd_add(a, b) }
}
1424
1425/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1426/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1427///
1428/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
1429#[inline]
1430#[target_feature(enable = "avx512fp16")]
1431#[cfg_attr(test, assert_instr(vaddph))]
1432#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1433#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1434pub const fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1435    unsafe {
1436        let r = _mm512_add_ph(a, b);
1437        simd_select_bitmask(k, r, src)
1438    }
1439}
1440
1441/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1442/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1443///
1444/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
1445#[inline]
1446#[target_feature(enable = "avx512fp16")]
1447#[cfg_attr(test, assert_instr(vaddph))]
1448#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1449#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1450pub const fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1451    unsafe {
1452        let r = _mm512_add_ph(a, b);
1453        simd_select_bitmask(k, r, _mm512_setzero_ph())
1454    }
1455}
1456
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`,
    // and `ROUNDING` is validated at compile time before the intrinsic call.
    unsafe {
        static_assert_rounding!(ROUNDING);
        vaddph(a, b, ROUNDING)
    }
}
1478
1479/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1480/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1481/// Rounding is done according to the rounding parameter, which can be one of:
1482///
1483/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1484/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1485/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1486/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1487/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1488///
1489/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
1490#[inline]
1491#[target_feature(enable = "avx512fp16")]
1492#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1493#[rustc_legacy_const_generics(4)]
1494#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1495pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
1496    src: __m512h,
1497    k: __mmask32,
1498    a: __m512h,
1499    b: __m512h,
1500) -> __m512h {
1501    unsafe {
1502        static_assert_rounding!(ROUNDING);
1503        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1504        simd_select_bitmask(k, r, src)
1505    }
1506}
1507
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`.
    unsafe {
        static_assert_rounding!(ROUNDING);
        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
        simd_select_bitmask(k, r, _mm512_setzero_ph())
    }
}
1534
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // With an all-ones mask, lane 0 always receives the computed sum, so the
    // zeroed `src` vector is never selected.
    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
1555
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`,
    // and `ROUNDING` is validated at compile time before the intrinsic call.
    unsafe {
        static_assert_rounding!(ROUNDING);
        vaddsh(a, b, src, k, ROUNDING)
    }
}
1584
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zeromask behavior is the masked form with an all-zero source vector.
    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
1606
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`;
    // lane index 0 is in bounds for the 8-lane vector.
    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
}
1619
1620/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1621/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1622/// writemask k (the element is copied from src when mask bit 0 is not set).
1623///
1624/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
1625#[inline]
1626#[target_feature(enable = "avx512fp16")]
1627#[cfg_attr(test, assert_instr(vaddsh))]
1628#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1629#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1630pub const fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1631    unsafe {
1632        let extractsrc: f16 = simd_extract!(src, 0);
1633        let mut add: f16 = extractsrc;
1634        if (k & 0b00000001) != 0 {
1635            let extracta: f16 = simd_extract!(a, 0);
1636            let extractb: f16 = simd_extract!(b, 0);
1637            add = extracta + extractb;
1638        }
1639        simd_insert!(a, 0, add)
1640    }
1641}
1642
1643/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1644/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1645/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1646///
1647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
1648#[inline]
1649#[target_feature(enable = "avx512fp16")]
1650#[cfg_attr(test, assert_instr(vaddsh))]
1651#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1652#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1653pub const fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1654    unsafe {
1655        let mut add: f16 = 0.;
1656        if (k & 0b00000001) != 0 {
1657            let extracta: f16 = simd_extract!(a, 0);
1658            let extractb: f16 = simd_extract!(b, 0);
1659            add = extracta + extractb;
1660        }
1661        simd_insert!(a, 0, add)
1662    }
1663}
1664
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: `avx512fp16`/`avx512vl` are enabled on this function via `#[target_feature]`.
    unsafe { simd_sub(a, b) }
}
1676
1677/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1678/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1679///
1680/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
1681#[inline]
1682#[target_feature(enable = "avx512fp16,avx512vl")]
1683#[cfg_attr(test, assert_instr(vsubph))]
1684#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1685#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1686pub const fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1687    unsafe {
1688        let r = _mm_sub_ph(a, b);
1689        simd_select_bitmask(k, r, src)
1690    }
1691}
1692
1693/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1694/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1695///
1696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
1697#[inline]
1698#[target_feature(enable = "avx512fp16,avx512vl")]
1699#[cfg_attr(test, assert_instr(vsubph))]
1700#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1701#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1702pub const fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1703    unsafe {
1704        let r = _mm_sub_ph(a, b);
1705        simd_select_bitmask(k, r, _mm_setzero_ph())
1706    }
1707}
1708
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: `avx512fp16`/`avx512vl` are enabled on this function via `#[target_feature]`.
    unsafe { simd_sub(a, b) }
}
1720
1721/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1722/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1723///
1724/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
1725#[inline]
1726#[target_feature(enable = "avx512fp16,avx512vl")]
1727#[cfg_attr(test, assert_instr(vsubph))]
1728#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1729#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1730pub const fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1731    unsafe {
1732        let r = _mm256_sub_ph(a, b);
1733        simd_select_bitmask(k, r, src)
1734    }
1735}
1736
1737/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1738/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1739///
1740/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
1741#[inline]
1742#[target_feature(enable = "avx512fp16,avx512vl")]
1743#[cfg_attr(test, assert_instr(vsubph))]
1744#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1745#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1746pub const fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1747    unsafe {
1748        let r = _mm256_sub_ph(a, b);
1749        simd_select_bitmask(k, r, _mm256_setzero_ph())
1750    }
1751}
1752
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`.
    unsafe { simd_sub(a, b) }
}
1764
1765/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1766/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1767///
1768/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
1769#[inline]
1770#[target_feature(enable = "avx512fp16")]
1771#[cfg_attr(test, assert_instr(vsubph))]
1772#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1773#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1774pub const fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1775    unsafe {
1776        let r = _mm512_sub_ph(a, b);
1777        simd_select_bitmask(k, r, src)
1778    }
1779}
1780
1781/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1782/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1783///
1784/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
1785#[inline]
1786#[target_feature(enable = "avx512fp16")]
1787#[cfg_attr(test, assert_instr(vsubph))]
1788#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1789#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1790pub const fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1791    unsafe {
1792        let r = _mm512_sub_ph(a, b);
1793        simd_select_bitmask(k, r, _mm512_setzero_ph())
1794    }
1795}
1796
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`,
    // and `ROUNDING` is validated at compile time before the intrinsic call.
    unsafe {
        static_assert_rounding!(ROUNDING);
        vsubph(a, b, ROUNDING)
    }
}
1818
1819/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1820/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1821/// Rounding is done according to the rounding parameter, which can be one of:
1822///
1823/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1824/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1825/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1826/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1827/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1828///
1829/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
1830#[inline]
1831#[target_feature(enable = "avx512fp16")]
1832#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1833#[rustc_legacy_const_generics(4)]
1834#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1835pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
1836    src: __m512h,
1837    k: __mmask32,
1838    a: __m512h,
1839    b: __m512h,
1840) -> __m512h {
1841    unsafe {
1842        static_assert_rounding!(ROUNDING);
1843        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1844        simd_select_bitmask(k, r, src)
1845    }
1846}
1847
1848/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1849/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1850/// Rounding is done according to the rounding parameter, which can be one of:
1851///
1852/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1853/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1854/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1855/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1856/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1857///
1858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
1859#[inline]
1860#[target_feature(enable = "avx512fp16")]
1861#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1862#[rustc_legacy_const_generics(3)]
1863#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1864pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
1865    k: __mmask32,
1866    a: __m512h,
1867    b: __m512h,
1868) -> __m512h {
1869    unsafe {
1870        static_assert_rounding!(ROUNDING);
1871        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1872        simd_select_bitmask(k, r, _mm512_setzero_ph())
1873    }
1874}
1875
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // With an all-ones mask, lane 0 always receives the computed difference, so
    // the zeroed `src` vector is never selected.
    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
1896
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`,
    // and `ROUNDING` is validated at compile time before the intrinsic call.
    unsafe {
        static_assert_rounding!(ROUNDING);
        vsubsh(a, b, src, k, ROUNDING)
    }
}
1925
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zeromask behavior is the masked form with an all-zero source vector.
    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
1947
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: `avx512fp16` is enabled on this function via `#[target_feature]`;
    // lane index 0 is in bounds for the 8-lane vector.
    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
}
1960
1961/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1962/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1963/// writemask k (the element is copied from src when mask bit 0 is not set).
1964///
1965/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
1966#[inline]
1967#[target_feature(enable = "avx512fp16")]
1968#[cfg_attr(test, assert_instr(vsubsh))]
1969#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1970#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1971pub const fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1972    unsafe {
1973        let extractsrc: f16 = simd_extract!(src, 0);
1974        let mut add: f16 = extractsrc;
1975        if (k & 0b00000001) != 0 {
1976            let extracta: f16 = simd_extract!(a, 0);
1977            let extractb: f16 = simd_extract!(b, 0);
1978            add = extracta - extractb;
1979        }
1980        simd_insert!(a, 0, add)
1981    }
1982}
1983
1984/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1985/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1986/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1987///
1988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
1989#[inline]
1990#[target_feature(enable = "avx512fp16")]
1991#[cfg_attr(test, assert_instr(vsubsh))]
1992#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1993#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1994pub const fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1995    unsafe {
1996        let mut add: f16 = 0.;
1997        if (k & 0b00000001) != 0 {
1998            let extracta: f16 = simd_extract!(a, 0);
1999            let extractb: f16 = simd_extract!(b, 0);
2000            add = extracta - extractb;
2001        }
2002        simd_insert!(a, 0, add)
2003    }
2004}
2005
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
    // Element-wise multiply of the 8 f16 lanes; lowers to `vmulph` (see `assert_instr` above).
    unsafe { simd_mul(a, b) }
}
2017
2018/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2019/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2020///
2021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
2022#[inline]
2023#[target_feature(enable = "avx512fp16,avx512vl")]
2024#[cfg_attr(test, assert_instr(vmulph))]
2025#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2026#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2027pub const fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2028    unsafe {
2029        let r = _mm_mul_ph(a, b);
2030        simd_select_bitmask(k, r, src)
2031    }
2032}
2033
2034/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2035/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2036///
2037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
2038#[inline]
2039#[target_feature(enable = "avx512fp16,avx512vl")]
2040#[cfg_attr(test, assert_instr(vmulph))]
2041#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2042#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2043pub const fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2044    unsafe {
2045        let r = _mm_mul_ph(a, b);
2046        simd_select_bitmask(k, r, _mm_setzero_ph())
2047    }
2048}
2049
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
    // Element-wise multiply of the 16 f16 lanes; lowers to `vmulph` (see `assert_instr` above).
    unsafe { simd_mul(a, b) }
}
2061
2062/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2063/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2064///
2065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
2066#[inline]
2067#[target_feature(enable = "avx512fp16,avx512vl")]
2068#[cfg_attr(test, assert_instr(vmulph))]
2069#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2070#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2071pub const fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2072    unsafe {
2073        let r = _mm256_mul_ph(a, b);
2074        simd_select_bitmask(k, r, src)
2075    }
2076}
2077
2078/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2079/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2080///
2081/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
2082#[inline]
2083#[target_feature(enable = "avx512fp16,avx512vl")]
2084#[cfg_attr(test, assert_instr(vmulph))]
2085#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2086#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2087pub const fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2088    unsafe {
2089        let r = _mm256_mul_ph(a, b);
2090        simd_select_bitmask(k, r, _mm256_setzero_ph())
2091    }
2092}
2093
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
    // Element-wise multiply of the 32 f16 lanes; lowers to `vmulph` (see `assert_instr` above).
    unsafe { simd_mul(a, b) }
}
2105
2106/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2107/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2108///
2109/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
2110#[inline]
2111#[target_feature(enable = "avx512fp16")]
2112#[cfg_attr(test, assert_instr(vmulph))]
2113#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2114#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2115pub const fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2116    unsafe {
2117        let r = _mm512_mul_ph(a, b);
2118        simd_select_bitmask(k, r, src)
2119    }
2120}
2121
2122/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2123/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2124///
2125/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
2126#[inline]
2127#[target_feature(enable = "avx512fp16")]
2128#[cfg_attr(test, assert_instr(vmulph))]
2129#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2130#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2131pub const fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2132    unsafe {
2133        let r = _mm512_mul_ph(a, b);
2134        simd_select_bitmask(k, r, _mm512_setzero_ph())
2135    }
2136}
2137
2138/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2139/// Rounding is done according to the rounding parameter, which can be one of:
2140///
2141/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2142/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2143/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2144/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2145/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2146///
2147/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
2148#[inline]
2149#[target_feature(enable = "avx512fp16")]
2150#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2151#[rustc_legacy_const_generics(2)]
2152#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2153pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2154    unsafe {
2155        static_assert_rounding!(ROUNDING);
2156        vmulph(a, b, ROUNDING)
2157    }
2158}
2159
2160/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2161/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2162/// Rounding is done according to the rounding parameter, which can be one of:
2163///
2164/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2165/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2166/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2167/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2168/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2169///
2170/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
2171#[inline]
2172#[target_feature(enable = "avx512fp16")]
2173#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2174#[rustc_legacy_const_generics(4)]
2175#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2176pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
2177    src: __m512h,
2178    k: __mmask32,
2179    a: __m512h,
2180    b: __m512h,
2181) -> __m512h {
2182    unsafe {
2183        static_assert_rounding!(ROUNDING);
2184        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2185        simd_select_bitmask(k, r, src)
2186    }
2187}
2188
2189/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2190/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2191/// Rounding is done according to the rounding parameter, which can be one of:
2192///
2193/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2194/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2195/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2196/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2197/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2198///
2199/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
2200#[inline]
2201#[target_feature(enable = "avx512fp16")]
2202#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2203#[rustc_legacy_const_generics(3)]
2204#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2205pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
2206    k: __mmask32,
2207    a: __m512h,
2208    b: __m512h,
2209) -> __m512h {
2210    unsafe {
2211        static_assert_rounding!(ROUNDING);
2212        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2213        simd_select_bitmask(k, r, _mm512_setzero_ph())
2214    }
2215}
2216
2217/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2218/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2219/// Rounding is done according to the rounding parameter, which can be one of:
2220///
2221/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2222/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2223/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2224/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2225/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2226///
2227/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
2228#[inline]
2229#[target_feature(enable = "avx512fp16")]
2230#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2231#[rustc_legacy_const_generics(2)]
2232#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2233pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2234    static_assert_rounding!(ROUNDING);
2235    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2236}
2237
2238/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2239/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2240/// writemask k (the element is copied from src when mask bit 0 is not set).
2241/// Rounding is done according to the rounding parameter, which can be one of:
2242///
2243/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2244/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2245/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2246/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2247/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2248///
2249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
2250#[inline]
2251#[target_feature(enable = "avx512fp16")]
2252#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2253#[rustc_legacy_const_generics(4)]
2254#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2255pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
2256    src: __m128h,
2257    k: __mmask8,
2258    a: __m128h,
2259    b: __m128h,
2260) -> __m128h {
2261    unsafe {
2262        static_assert_rounding!(ROUNDING);
2263        vmulsh(a, b, src, k, ROUNDING)
2264    }
2265}
2266
2267/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2268/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2269/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2270/// Rounding is done according to the rounding parameter, which can be one of:
2271///
2272/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2273/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2274/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2275/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2276/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2277///
2278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
2279#[inline]
2280#[target_feature(enable = "avx512fp16")]
2281#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2282#[rustc_legacy_const_generics(3)]
2283#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2284pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2285    static_assert_rounding!(ROUNDING);
2286    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2287}
2288
2289/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2290/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2291///
2292/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
2293#[inline]
2294#[target_feature(enable = "avx512fp16")]
2295#[cfg_attr(test, assert_instr(vmulsh))]
2296#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2297#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2298pub const fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2299    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
2300}
2301
2302/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2303/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2304/// writemask k (the element is copied from src when mask bit 0 is not set).
2305///
2306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
2307#[inline]
2308#[target_feature(enable = "avx512fp16")]
2309#[cfg_attr(test, assert_instr(vmulsh))]
2310#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2311#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2312pub const fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2313    unsafe {
2314        let extractsrc: f16 = simd_extract!(src, 0);
2315        let mut add: f16 = extractsrc;
2316        if (k & 0b00000001) != 0 {
2317            let extracta: f16 = simd_extract!(a, 0);
2318            let extractb: f16 = simd_extract!(b, 0);
2319            add = extracta * extractb;
2320        }
2321        simd_insert!(a, 0, add)
2322    }
2323}
2324
2325/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2326/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2327/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2328///
2329/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
2330#[inline]
2331#[target_feature(enable = "avx512fp16")]
2332#[cfg_attr(test, assert_instr(vmulsh))]
2333#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2334#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2335pub const fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2336    unsafe {
2337        let mut add: f16 = 0.;
2338        if (k & 0b00000001) != 0 {
2339            let extracta: f16 = simd_extract!(a, 0);
2340            let extractb: f16 = simd_extract!(b, 0);
2341            add = extracta * extractb;
2342        }
2343        simd_insert!(a, 0, add)
2344    }
2345}
2346
/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vdivph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
    // Element-wise divide of the 8 f16 lanes; lowers to `vdivph` (see `assert_instr` above).
    unsafe { simd_div(a, b) }
}
2358
2359/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2360/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2361///
2362/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
2363#[inline]
2364#[target_feature(enable = "avx512fp16,avx512vl")]
2365#[cfg_attr(test, assert_instr(vdivph))]
2366#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2367#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2368pub const fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2369    unsafe {
2370        let r = _mm_div_ph(a, b);
2371        simd_select_bitmask(k, r, src)
2372    }
2373}
2374
2375/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2376/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2377///
2378/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
2379#[inline]
2380#[target_feature(enable = "avx512fp16,avx512vl")]
2381#[cfg_attr(test, assert_instr(vdivph))]
2382#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2383#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2384pub const fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2385    unsafe {
2386        let r = _mm_div_ph(a, b);
2387        simd_select_bitmask(k, r, _mm_setzero_ph())
2388    }
2389}
2390
/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vdivph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
    // Element-wise divide of the 16 f16 lanes; lowers to `vdivph` (see `assert_instr` above).
    unsafe { simd_div(a, b) }
}
2402
2403/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2404/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2405///
2406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2407#[inline]
2408#[target_feature(enable = "avx512fp16,avx512vl")]
2409#[cfg_attr(test, assert_instr(vdivph))]
2410#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2411#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2412pub const fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2413    unsafe {
2414        let r = _mm256_div_ph(a, b);
2415        simd_select_bitmask(k, r, src)
2416    }
2417}
2418
2419/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2420/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2421///
2422/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2423#[inline]
2424#[target_feature(enable = "avx512fp16,avx512vl")]
2425#[cfg_attr(test, assert_instr(vdivph))]
2426#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2427#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2428pub const fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2429    unsafe {
2430        let r = _mm256_div_ph(a, b);
2431        simd_select_bitmask(k, r, _mm256_setzero_ph())
2432    }
2433}
2434
/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
    // Element-wise divide of the 32 f16 lanes; lowers to `vdivph` (see `assert_instr` above).
    unsafe { simd_div(a, b) }
}
2446
2447/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2448/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2449///
2450/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2451#[inline]
2452#[target_feature(enable = "avx512fp16")]
2453#[cfg_attr(test, assert_instr(vdivph))]
2454#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2455#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2456pub const fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2457    unsafe {
2458        let r = _mm512_div_ph(a, b);
2459        simd_select_bitmask(k, r, src)
2460    }
2461}
2462
2463/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2464/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2465///
2466/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2467#[inline]
2468#[target_feature(enable = "avx512fp16")]
2469#[cfg_attr(test, assert_instr(vdivph))]
2470#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2471#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2472pub const fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2473    unsafe {
2474        let r = _mm512_div_ph(a, b);
2475        simd_select_bitmask(k, r, _mm512_setzero_ph())
2476    }
2477}
2478
2479/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2480/// Rounding is done according to the rounding parameter, which can be one of:
2481///
2482/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2483/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2484/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2485/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2486/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2487///
2488/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
2489#[inline]
2490#[target_feature(enable = "avx512fp16")]
2491#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2492#[rustc_legacy_const_generics(2)]
2493#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2494pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2495    unsafe {
2496        static_assert_rounding!(ROUNDING);
2497        vdivph(a, b, ROUNDING)
2498    }
2499}
2500
2501/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2502/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2503/// Rounding is done according to the rounding parameter, which can be one of:
2504///
2505/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2506/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2507/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2508/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2509/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2510///
2511/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
2512#[inline]
2513#[target_feature(enable = "avx512fp16")]
2514#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2515#[rustc_legacy_const_generics(4)]
2516#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2517pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
2518    src: __m512h,
2519    k: __mmask32,
2520    a: __m512h,
2521    b: __m512h,
2522) -> __m512h {
2523    unsafe {
2524        static_assert_rounding!(ROUNDING);
2525        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2526        simd_select_bitmask(k, r, src)
2527    }
2528}
2529
2530/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2531/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2532/// Rounding is done according to the rounding parameter, which can be one of:
2533///
2534/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2535/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2536/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2537/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2538/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2539///
2540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
2541#[inline]
2542#[target_feature(enable = "avx512fp16")]
2543#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2544#[rustc_legacy_const_generics(3)]
2545#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2546pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
2547    k: __mmask32,
2548    a: __m512h,
2549    b: __m512h,
2550) -> __m512h {
2551    unsafe {
2552        static_assert_rounding!(ROUNDING);
2553        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2554        simd_select_bitmask(k, r, _mm512_setzero_ph())
2555    }
2556}
2557
2558/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2559/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2560/// Rounding is done according to the rounding parameter, which can be one of:
2561///
2562/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2563/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2564/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2565/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2566/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2567///
2568/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
2569#[inline]
2570#[target_feature(enable = "avx512fp16")]
2571#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2572#[rustc_legacy_const_generics(2)]
2573#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2574pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2575    static_assert_rounding!(ROUNDING);
2576    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2577}
2578
2579/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2580/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2581/// writemask k (the element is copied from src when mask bit 0 is not set).
2582/// Rounding is done according to the rounding parameter, which can be one of:
2583///
2584/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2585/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2586/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2587/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2588/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2589///
2590/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
2591#[inline]
2592#[target_feature(enable = "avx512fp16")]
2593#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2594#[rustc_legacy_const_generics(4)]
2595#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2596pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
2597    src: __m128h,
2598    k: __mmask8,
2599    a: __m128h,
2600    b: __m128h,
2601) -> __m128h {
2602    unsafe {
2603        static_assert_rounding!(ROUNDING);
2604        vdivsh(a, b, src, k, ROUNDING)
2605    }
2606}
2607
2608/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2609/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2610/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2611/// Rounding is done according to the rounding parameter, which can be one of:
2612///
2613/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2614/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2615/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2616/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2617/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2618///
2619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
2620#[inline]
2621#[target_feature(enable = "avx512fp16")]
2622#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2623#[rustc_legacy_const_generics(3)]
2624#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2625pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2626    static_assert_rounding!(ROUNDING);
2627    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2628}
2629
2630/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2631/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2632///
2633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
2634#[inline]
2635#[target_feature(enable = "avx512fp16")]
2636#[cfg_attr(test, assert_instr(vdivsh))]
2637#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2638#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2639pub const fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2640    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
2641}
2642
2643/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2644/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2645/// writemask k (the element is copied from src when mask bit 0 is not set).
2646///
2647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
2648#[inline]
2649#[target_feature(enable = "avx512fp16")]
2650#[cfg_attr(test, assert_instr(vdivsh))]
2651#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2652#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2653pub const fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2654    unsafe {
2655        let extractsrc: f16 = simd_extract!(src, 0);
2656        let mut add: f16 = extractsrc;
2657        if (k & 0b00000001) != 0 {
2658            let extracta: f16 = simd_extract!(a, 0);
2659            let extractb: f16 = simd_extract!(b, 0);
2660            add = extracta / extractb;
2661        }
2662        simd_insert!(a, 0, add)
2663    }
2664}
2665
2666/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2667/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2668/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2669///
2670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
2671#[inline]
2672#[target_feature(enable = "avx512fp16")]
2673#[cfg_attr(test, assert_instr(vdivsh))]
2674#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2675#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2676pub const fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2677    unsafe {
2678        let mut add: f16 = 0.;
2679        if (k & 0b00000001) != 0 {
2680            let extracta: f16 = simd_extract!(a, 0);
2681            let extractb: f16 = simd_extract!(b, 0);
2682            add = extracta / extractb;
2683        }
2684        simd_insert!(a, 0, add)
2685    }
2686}
2687
2688/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2689/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2690/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2691///
2692/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
2693#[inline]
2694#[target_feature(enable = "avx512fp16,avx512vl")]
2695#[cfg_attr(test, assert_instr(vfmulcph))]
2696#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2697pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
2698    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
2699}
2700
2701/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2702/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2703/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2704///
2705/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
2706#[inline]
2707#[target_feature(enable = "avx512fp16,avx512vl")]
2708#[cfg_attr(test, assert_instr(vfmulcph))]
2709#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2710pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2711    unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
2712}
2713
2714/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2715/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2716/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2717///
2718/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
2719#[inline]
2720#[target_feature(enable = "avx512fp16,avx512vl")]
2721#[cfg_attr(test, assert_instr(vfmulcph))]
2722#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2723pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2724    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
2725}
2726
2727/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2728/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2729/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2730///
2731/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
2732#[inline]
2733#[target_feature(enable = "avx512fp16,avx512vl")]
2734#[cfg_attr(test, assert_instr(vfmulcph))]
2735#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2736pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
2737    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
2738}
2739
2740/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2741/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2742/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2743///
2744/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
2745#[inline]
2746#[target_feature(enable = "avx512fp16,avx512vl")]
2747#[cfg_attr(test, assert_instr(vfmulcph))]
2748#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2749pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2750    unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
2751}
2752
2753/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2754/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2755/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2756///
2757/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
2758#[inline]
2759#[target_feature(enable = "avx512fp16,avx512vl")]
2760#[cfg_attr(test, assert_instr(vfmulcph))]
2761#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2762pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2763    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
2764}
2765
2766/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2767/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2768/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2769///
2770/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
2771#[inline]
2772#[target_feature(enable = "avx512fp16")]
2773#[cfg_attr(test, assert_instr(vfmulcph))]
2774#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2775pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
2776    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
2777}
2778
2779/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2780/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2781/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2782///
2783/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
2784#[inline]
2785#[target_feature(enable = "avx512fp16")]
2786#[cfg_attr(test, assert_instr(vfmulcph))]
2787#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2788pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2789    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2790}
2791
2792/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2793/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2794/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2795///
2796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
2797#[inline]
2798#[target_feature(enable = "avx512fp16")]
2799#[cfg_attr(test, assert_instr(vfmulcph))]
2800#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2801pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2802    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
2803}
2804
2805/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
2806/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2807/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2808///
2809/// Rounding is done according to the rounding parameter, which can be one of:
2810///
2811/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2812/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2813/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2814/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2815/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2816///
2817/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
2818#[inline]
2819#[target_feature(enable = "avx512fp16")]
2820#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2821#[rustc_legacy_const_generics(2)]
2822#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2823pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2824    static_assert_rounding!(ROUNDING);
2825    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
2826}
2827
2828/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
2829/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2830/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2831///
2832/// Rounding is done according to the rounding parameter, which can be one of:
2833///
2834/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2835/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2836/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2837/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2838/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2839///
2840/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
2841#[inline]
2842#[target_feature(enable = "avx512fp16")]
2843#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2844#[rustc_legacy_const_generics(4)]
2845#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2846pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
2847    src: __m512h,
2848    k: __mmask16,
2849    a: __m512h,
2850    b: __m512h,
2851) -> __m512h {
2852    unsafe {
2853        static_assert_rounding!(ROUNDING);
2854        transmute(vfmulcph_512(
2855            transmute(a),
2856            transmute(b),
2857            transmute(src),
2858            k,
2859            ROUNDING,
2860        ))
2861    }
2862}
2863
2864/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2865/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2866/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2867///
2868/// Rounding is done according to the rounding parameter, which can be one of:
2869///
2870/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2871/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2872/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2873/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2874/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2875///
2876/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
2877#[inline]
2878#[target_feature(enable = "avx512fp16")]
2879#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2880#[rustc_legacy_const_generics(3)]
2881#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2882pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
2883    k: __mmask16,
2884    a: __m512h,
2885    b: __m512h,
2886) -> __m512h {
2887    static_assert_rounding!(ROUNDING);
2888    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
2889}
2890
2891/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2892/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2893/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2894/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2895///
2896/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
2897#[inline]
2898#[target_feature(enable = "avx512fp16")]
2899#[cfg_attr(test, assert_instr(vfmulcsh))]
2900#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2901pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
2902    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
2903}
2904
2905/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2906/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2907/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
2908/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2909///
2910/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
2911#[inline]
2912#[target_feature(enable = "avx512fp16")]
2913#[cfg_attr(test, assert_instr(vfmulcsh))]
2914#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2915pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2916    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2917}
2918
2919/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2920/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2921/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2922/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2923///
2924/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
2925#[inline]
2926#[target_feature(enable = "avx512fp16")]
2927#[cfg_attr(test, assert_instr(vfmulcsh))]
2928#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2929pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2930    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b)
2931}
2932
2933/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2934/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2935/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2936/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2937///
2938/// Rounding is done according to the rounding parameter, which can be one of:
2939///
2940/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2941/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2942/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2943/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2944/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2945///
2946/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
2947#[inline]
2948#[target_feature(enable = "avx512fp16")]
2949#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2950#[rustc_legacy_const_generics(2)]
2951#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2952pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2953    static_assert_rounding!(ROUNDING);
2954    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2955}
2956
2957/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2958/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2959/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2960/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2961///
2962/// Rounding is done according to the rounding parameter, which can be one of:
2963///
2964/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2965/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2966/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2967/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2968/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2969///
2970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
2971#[inline]
2972#[target_feature(enable = "avx512fp16")]
2973#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2974#[rustc_legacy_const_generics(4)]
2975#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2976pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
2977    src: __m128h,
2978    k: __mmask8,
2979    a: __m128h,
2980    b: __m128h,
2981) -> __m128h {
2982    unsafe {
2983        static_assert_rounding!(ROUNDING);
2984        transmute(vfmulcsh(
2985            transmute(a),
2986            transmute(b),
2987            transmute(src),
2988            k,
2989            ROUNDING,
2990        ))
2991    }
2992}
2993
2994/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2995/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2996/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2997/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2998///
2999/// Rounding is done according to the rounding parameter, which can be one of:
3000///
3001/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3002/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3003/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3004/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3005/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3006///
3007/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
3008#[inline]
3009#[target_feature(enable = "avx512fp16")]
3010#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3011#[rustc_legacy_const_generics(3)]
3012#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3013pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
3014    k: __mmask8,
3015    a: __m128h,
3016    b: __m128h,
3017) -> __m128h {
3018    static_assert_rounding!(ROUNDING);
3019    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
3020}
3021
3022/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
3023/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3024/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3025///
3026/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
3027#[inline]
3028#[target_feature(enable = "avx512fp16,avx512vl")]
3029#[cfg_attr(test, assert_instr(vfmulcph))]
3030#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3031pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
3032    _mm_mul_pch(a, b)
3033}
3034
3035/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3036/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
3037/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3038///
3039/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
3040#[inline]
3041#[target_feature(enable = "avx512fp16,avx512vl")]
3042#[cfg_attr(test, assert_instr(vfmulcph))]
3043#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3044pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3045    _mm_mask_mul_pch(src, k, a, b)
3046}
3047
3048/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3049/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3050/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3051///
3052/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
3053#[inline]
3054#[target_feature(enable = "avx512fp16,avx512vl")]
3055#[cfg_attr(test, assert_instr(vfmulcph))]
3056#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3057pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3058    _mm_maskz_mul_pch(k, a, b)
3059}
3060
3061/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
3062/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3063/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3064///
3065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
3066#[inline]
3067#[target_feature(enable = "avx512fp16,avx512vl")]
3068#[cfg_attr(test, assert_instr(vfmulcph))]
3069#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3070pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
3071    _mm256_mul_pch(a, b)
3072}
3073
3074/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3075/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3076/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3077///
3078/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
3079#[inline]
3080#[target_feature(enable = "avx512fp16,avx512vl")]
3081#[cfg_attr(test, assert_instr(vfmulcph))]
3082#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3083pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3084    _mm256_mask_mul_pch(src, k, a, b)
3085}
3086
3087/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3088/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3089/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3090///
3091/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
3092#[inline]
3093#[target_feature(enable = "avx512fp16,avx512vl")]
3094#[cfg_attr(test, assert_instr(vfmulcph))]
3095#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3096pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3097    _mm256_maskz_mul_pch(k, a, b)
3098}
3099
3100/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
3101/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3102///
3103/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
3104#[inline]
3105#[target_feature(enable = "avx512fp16")]
3106#[cfg_attr(test, assert_instr(vfmulcph))]
3107#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3108pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
3109    _mm512_mul_pch(a, b)
3110}
3111
3112/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3113/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3114/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3115///
3116/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
3117#[inline]
3118#[target_feature(enable = "avx512fp16")]
3119#[cfg_attr(test, assert_instr(vfmulcph))]
3120#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3121pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3122    _mm512_mask_mul_pch(src, k, a, b)
3123}
3124
3125/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3126/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3127/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3128///
3129/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
3130#[inline]
3131#[target_feature(enable = "avx512fp16")]
3132#[cfg_attr(test, assert_instr(vfmulcph))]
3133#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3134pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3135    _mm512_maskz_mul_pch(k, a, b)
3136}
3137
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    // Reject invalid rounding-mode constants at compile time, then defer to the
    // `mul` spelling; `fmul` is an alias intrinsic.
    static_assert_rounding!(ROUNDING);
    _mm512_mul_round_pch::<ROUNDING>(a, b)
}
3158
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // Reject invalid rounding-mode constants at compile time, then defer to the
    // `mul` spelling; `fmul` is an alias intrinsic.
    static_assert_rounding!(ROUNDING);
    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
}
3185
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // Reject invalid rounding-mode constants at compile time, then defer to the
    // `mul` spelling; `fmul` is an alias intrinsic.
    static_assert_rounding!(ROUNDING);
    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
}
3211
/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
    // `fmul` is an alias intrinsic: identical semantics to `_mm_mul_sch`.
    _mm_mul_sch(a, b)
}
3224
/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fmul` is an alias intrinsic: identical semantics to `_mm_mask_mul_sch`.
    _mm_mask_mul_sch(src, k, a, b)
}
3237
/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fmul` is an alias intrinsic: identical semantics to `_mm_maskz_mul_sch`.
    _mm_maskz_mul_sch(k, a, b)
}
3250
/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Reject invalid rounding-mode constants at compile time, then defer to the
    // `mul` spelling; `fmul` is an alias intrinsic.
    static_assert_rounding!(ROUNDING);
    _mm_mul_round_sch::<ROUNDING>(a, b)
}
3272
/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject invalid rounding-mode constants at compile time, then defer to the
    // `mul` spelling; `fmul` is an alias intrinsic.
    static_assert_rounding!(ROUNDING);
    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
}
3300
/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject invalid rounding-mode constants at compile time, then defer to the
    // `mul` spelling; `fmul` is an alias intrinsic.
    static_assert_rounding!(ROUNDING);
    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
}
3327
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask: every element is computed, so the undefined merge source is
    // never observed.
    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
}
3341
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // The transmutes only reinterpret between the public `__m128h` type and the
    // 128-bit vector type the LLVM intrinsic expects; `src` supplies the values
    // for masked-off elements, as documented above.
    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
}
3355
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is write-masking with an all-zero merge source.
    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
}
3369
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
    // All-ones mask: every element is computed, so the undefined merge source is
    // never observed.
    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
}
3383
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // The transmutes only reinterpret between the public `__m256h` type and the
    // 256-bit vector type the LLVM intrinsic expects; `src` supplies the values
    // for masked-off elements, as documented above.
    unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
}
3397
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // Zero-masking is write-masking with an all-zero merge source.
    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
}
3411
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
    // All-ones mask: every element is computed, so the undefined merge source is
    // never observed.
    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
}
3425
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Non-rounding form: use the current MXCSR rounding mode.
    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
3439
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Zero-masking is write-masking with an all-zero merge source.
    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
}
3453
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    // Reject invalid rounding-mode constants at compile time. An all-ones mask
    // means the undefined merge source is never observed.
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
}
3477
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time. The transmutes
        // only reinterpret between the public `__m512h` type and the 512-bit
        // vector type the LLVM intrinsic expects; `src` supplies the values for
        // masked-off elements, as documented above.
        static_assert_rounding!(ROUNDING);
        transmute(vfcmulcph_512(
            transmute(a),
            transmute(b),
            transmute(src),
            k,
            ROUNDING,
        ))
    }
}
3514
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // Reject invalid rounding-mode constants at compile time; zero-masking is
    // write-masking with an all-zero merge source.
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
}
3542
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask with a zeroed merge source; only mask bit 0 is relevant for
    // the scalar form.
    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
3555
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Non-rounding form: use the current MXCSR rounding mode.
    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
3569
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is write-masking with an all-zero merge source.
    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b)
}
3583
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Reject invalid rounding-mode constants at compile time; all-ones mask with
    // a zeroed merge source.
    static_assert_rounding!(ROUNDING);
    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
3606
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time. The transmutes
        // only reinterpret between the public `__m128h` type and the 128-bit
        // vector type the LLVM intrinsic expects; `src` supplies the result when
        // mask bit 0 is not set, as documented above.
        static_assert_rounding!(ROUNDING);
        transmute(vfcmulcsh(
            transmute(a),
            transmute(b),
            transmute(src),
            k,
            ROUNDING,
        ))
    }
}
3643
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject invalid rounding-mode constants at compile time; zero-masking is
    // write-masking with an all-zero merge source.
    static_assert_rounding!(ROUNDING);
    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
3671
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is an alias intrinsic: identical semantics to `_mm_cmul_pch`.
    _mm_cmul_pch(a, b)
}
3685
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm_mask_cmul_pch(src, k, a, b)
}
3699
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm_maskz_cmul_pch(k, a, b)
}
3713
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm256_cmul_pch(a, b)
}
3727
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm256_mask_cmul_pch(src, k, a, b)
}
3741
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm256_maskz_cmul_pch(k, a, b)
}
3755
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm512_cmul_pch(a, b)
}
3769
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm512_mask_cmul_pch(src, k, a, b)
}
3783
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm512_maskz_cmul_pch(k, a, b)
}
3797
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm512_cmul_round_pch::<ROUNDING>(a, b)
}
3820
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
}
3849
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
}
3877
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm_cmul_sch(a, b)
}
3891
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm_mask_cmul_sch(src, k, a, b)
}
3905
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm_maskz_cmul_sch(k, a, b)
}
3919
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm_cmul_round_sch::<ROUNDING>(a, b)
}
3942
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
}
3971
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is an alias of `cmul`; delegate to the canonical implementation.
    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
}
3999
4000/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
4001/// the results in dst.
4002///
4003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
4004#[inline]
4005#[target_feature(enable = "avx512fp16,avx512vl")]
4006#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4007#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4008pub const fn _mm_abs_ph(v2: __m128h) -> __m128h {
4009    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
4010}
4011
4012/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
4013/// the result in dst.
4014///
4015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
4016#[inline]
4017#[target_feature(enable = "avx512fp16,avx512vl")]
4018#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4019#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4020pub const fn _mm256_abs_ph(v2: __m256h) -> __m256h {
4021    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
4022}
4023
4024/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
4025/// the result in dst.
4026///
4027/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
4028#[inline]
4029#[target_feature(enable = "avx512fp16")]
4030#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4031#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4032pub const fn _mm512_abs_ph(v2: __m512h) -> __m512h {
4033    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
4034}
4035
4036/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
4037/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
4038/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
4039/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4040///
4041/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
4042#[inline]
4043#[target_feature(enable = "avx512fp16,avx512vl")]
4044#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4045#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4046pub const fn _mm_conj_pch(a: __m128h) -> __m128h {
4047    unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) }
4048}
4049
4050/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4051/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4052/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4053/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4054///
4055/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
4056#[inline]
4057#[target_feature(enable = "avx512fp16,avx512vl")]
4058#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4059#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4060pub const fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
4061    unsafe {
4062        let r: __m128 = transmute(_mm_conj_pch(a));
4063        transmute(simd_select_bitmask(k, r, transmute(src)))
4064    }
4065}
4066
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
    // Zero-masking is the write-masked form with an all-zero source vector.
    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
}
4080
4081/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
4082/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4083/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4084///
4085/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
4086#[inline]
4087#[target_feature(enable = "avx512fp16,avx512vl")]
4088#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4089#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4090pub const fn _mm256_conj_pch(a: __m256h) -> __m256h {
4091    unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) }
4092}
4093
4094/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4095/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4096/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4097/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4098///
4099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
4100#[inline]
4101#[target_feature(enable = "avx512fp16,avx512vl")]
4102#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4103#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4104pub const fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
4105    unsafe {
4106        let r: __m256 = transmute(_mm256_conj_pch(a));
4107        transmute(simd_select_bitmask(k, r, transmute(src)))
4108    }
4109}
4110
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
    // Zero-masking is the write-masked form with an all-zero source vector.
    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
}
4124
4125/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
4126/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4127/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4128///
4129/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
4130#[inline]
4131#[target_feature(enable = "avx512fp16")]
4132#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4133#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4134pub const fn _mm512_conj_pch(a: __m512h) -> __m512h {
4135    unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) }
4136}
4137
4138/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4139/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4140/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4141/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4142///
4143/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
4144#[inline]
4145#[target_feature(enable = "avx512fp16")]
4146#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4147#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
4148pub const fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
4149    unsafe {
4150        let r: __m512 = transmute(_mm512_conj_pch(a));
4151        transmute(simd_select_bitmask(k, r, transmute(src)))
4152    }
4153}
4154
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
    // Zero-masking is the write-masked form with an all-zero source vector.
    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
}
4168
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // An all-ones mask (0xff) makes the masked form behave as the unmasked operation.
    _mm_mask3_fmadd_pch(a, b, c, 0xff)
}
4181
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Compute the fused multiply-accumulate, then blend with `a` under `k` so that
        // masked-out elements come from `a` (not from `c` as the mask3 form would give).
        let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4198
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // Forward to the compiler intrinsic; the transmutes reinterpret the f16
        // vectors into the intrinsic's operand type without changing any bits.
        transmute(vfmaddcph_mask3_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4219
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Zero-masking is performed by the underlying intrinsic itself, so
        // `k` is forwarded unchanged.
        transmute(vfmaddcph_maskz_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4240
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Unmasked form: delegate to the mask3 variant with all 8 mask bits set
    // (one bit per complex element).
    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
}
4253
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // View the result as f32 lanes so that one bit of `k` selects a whole
        // complex number (two adjacent f16 elements); unselected lanes take
        // the corresponding lane of `a`.
        let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4270
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
    unsafe {
        // Merge-masking with `c` as the passthrough is performed by the
        // underlying intrinsic itself, so `k` is forwarded unchanged.
        transmute(vfmaddcph_mask3_256(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4291
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // Zero-masking is performed by the underlying intrinsic itself, so
        // `k` is forwarded unchanged.
        transmute(vfmaddcph_maskz_256(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4312
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Non-rounding form: delegate to the rounding variant with the current
    // MXCSR rounding mode.
    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4325
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
    // Non-rounding form: delegate to the rounding variant with the current
    // MXCSR rounding mode.
    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4339
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
    // Non-rounding form: delegate to the rounding variant with the current
    // MXCSR rounding mode.
    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4353
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Non-rounding form: delegate to the rounding variant with the current
    // MXCSR rounding mode.
    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4367
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // Unmasked form: delegate to the mask3 variant with all 16 mask bits set
    // (one bit per complex element).
    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
}
4390
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask16,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // View the result as f32 lanes so that one bit of `k` selects a whole
        // complex number (two adjacent f16 elements); unselected lanes take
        // the corresponding lane of `a`.
        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4422
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask16,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Merge-masking with `c` as the passthrough is performed by the
        // underlying intrinsic itself; `k` and the rounding mode are
        // forwarded unchanged.
        transmute(vfmaddcph_mask3_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4459
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Zero-masking is performed by the underlying intrinsic itself; `k`
        // and the rounding mode are forwarded unchanged.
        transmute(vfmaddcph_maskz_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4496
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Non-rounding form: delegate to the rounding variant with the current
    // MXCSR rounding mode.
    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4510
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from a when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Non-rounding form: delegate to the rounding variant with the current
    // MXCSR rounding mode.
    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4525
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from c when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Non-rounding form: delegate to the rounding variant with the current
    // MXCSR rounding mode.
    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4540
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Non-rounding form: delegate to the rounding variant with the current
    // MXCSR rounding mode.
    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4555
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Unmasked form: call the masked intrinsic with an all-ones mask so
        // the result is always written.
        transmute(vfmaddcsh_mask(
            transmute(a),
            transmute(b),
            transmute(c),
            0xff,
            ROUNDING,
        ))
    }
}
4586
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from a when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        let a = transmute(a);
        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what Clang does
        // Keep the lower complex number from `r` only when mask bit 0 is set
        // (falling back to `a` otherwise); the upper elements come from `a`.
        transmute(_mm_mask_move_ss(a, k, a, r))
    }
}
4620
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from c when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        let c = transmute(c);
        // The intrinsic already merges with `c` on a clear mask bit; the final
        // move keeps the computed lower complex number while taking the upper
        // elements from `c`.
        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
        transmute(_mm_move_ss(c, r))
    }
}
4654
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Zero-masking is performed by the underlying intrinsic itself; `k`
        // and the rounding mode are forwarded unchanged.
        transmute(vfmaddcsh_maskz(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4692
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Unmasked form: delegate to the mask3 variant with an all-ones writemask
    // so every complex element (pair of adjacent f16 lanes) is written.
    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
}
4706
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // View the result as f32 lanes so that one bit of `k` selects a whole
        // complex number (two adjacent f16 elements); unselected lanes take
        // the corresponding lane of `a`.
        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4724
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // Merge-masking with `c` as the passthrough is performed by the
        // underlying intrinsic itself, so `k` is forwarded unchanged.
        transmute(vfcmaddcph_mask3_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4746
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Zero-masking is performed by the underlying intrinsic itself, so
        // `k` is forwarded unchanged.
        transmute(vfcmaddcph_maskz_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4768
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Unmasked form: delegate to the mask3 variant with all 8 mask bits set
    // (one bit per complex element).
    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
}
4782
4783/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4784/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4785/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4786/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4787/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4788///
4789/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
4790#[inline]
4791#[target_feature(enable = "avx512fp16,avx512vl")]
4792#[cfg_attr(test, assert_instr(vfcmaddcph))]
4793#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4794pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4795    unsafe {
4796        let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
4797        transmute(simd_select_bitmask(k, r, transmute(a)))
4798    }
4799}
4800
4801/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4802/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4803/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4804/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4805/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4806///
4807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
4808#[inline]
4809#[target_feature(enable = "avx512fp16,avx512vl")]
4810#[cfg_attr(test, assert_instr(vfcmaddcph))]
4811#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4812pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4813    unsafe {
4814        transmute(vfcmaddcph_mask3_256(
4815            transmute(a),
4816            transmute(b),
4817            transmute(c),
4818            k,
4819        ))
4820    }
4821}
4822
4823/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4824/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4825/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4826/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4827/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4828///
4829/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
4830#[inline]
4831#[target_feature(enable = "avx512fp16,avx512vl")]
4832#[cfg_attr(test, assert_instr(vfcmaddcph))]
4833#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4834pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4835    unsafe {
4836        transmute(vfcmaddcph_maskz_256(
4837            transmute(a),
4838            transmute(b),
4839            transmute(c),
4840            k,
4841        ))
4842    }
4843}
4844
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding form, selecting the current MXCSR rounding mode.
    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4858
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding form, selecting the current MXCSR rounding mode.
    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4873
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
    // Delegate to the rounding form, selecting the current MXCSR rounding mode.
    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4888
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding form, selecting the current MXCSR rounding mode.
    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4903
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // An all-ones mask (0xffff covers all 16 complex lanes) makes the mask3
    // form behave exactly like the unmasked operation.
    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
}
4927
4928/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4929/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4930/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4931/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4932/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4933///
4934/// Rounding is done according to the rounding parameter, which can be one of:
4935///
4936/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4937/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4938/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4939/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4940/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4941///
4942/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
4943#[inline]
4944#[target_feature(enable = "avx512fp16")]
4945#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4946#[rustc_legacy_const_generics(4)]
4947#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4948pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
4949    a: __m512h,
4950    k: __mmask16,
4951    b: __m512h,
4952    c: __m512h,
4953) -> __m512h {
4954    unsafe {
4955        static_assert_rounding!(ROUNDING);
4956        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does
4957        transmute(simd_select_bitmask(k, r, transmute(a)))
4958    }
4959}
4960
4961/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4962/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
4963/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
4964/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1`, or the complex
4965/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4966///
4967/// Rounding is done according to the rounding parameter, which can be one of:
4968///
4969/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4970/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4971/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4972/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4973/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4974///
4975/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
4976#[inline]
4977#[target_feature(enable = "avx512fp16")]
4978#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4979#[rustc_legacy_const_generics(4)]
4980#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4981pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
4982    a: __m512h,
4983    b: __m512h,
4984    c: __m512h,
4985    k: __mmask16,
4986) -> __m512h {
4987    unsafe {
4988        static_assert_rounding!(ROUNDING);
4989        transmute(vfcmaddcph_mask3_512(
4990            transmute(a),
4991            transmute(b),
4992            transmute(c),
4993            k,
4994            ROUNDING,
4995        ))
4996    }
4997}
4998
4999/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
5000/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
5001/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
5002/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1`, or the complex
5003/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5004///
5005/// Rounding is done according to the rounding parameter, which can be one of:
5006///
5007/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5008/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5009/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5010/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5011/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5012///
5013/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
5014#[inline]
5015#[target_feature(enable = "avx512fp16")]
5016#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
5017#[rustc_legacy_const_generics(4)]
5018#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5019pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
5020    k: __mmask16,
5021    a: __m512h,
5022    b: __m512h,
5023    c: __m512h,
5024) -> __m512h {
5025    unsafe {
5026        static_assert_rounding!(ROUNDING);
5027        transmute(vfcmaddcph_maskz_512(
5028            transmute(a),
5029            transmute(b),
5030            transmute(c),
5031            k,
5032            ROUNDING,
5033        ))
5034    }
5035}
5036
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding form, selecting the current MXCSR rounding mode.
    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
5051
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding form, selecting the current MXCSR rounding mode.
    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
5067
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Delegate to the rounding form, selecting the current MXCSR rounding mode.
    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
5083
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding form, selecting the current MXCSR rounding mode.
    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
5099
5100/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5101/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
5102/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
5103/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
5104/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5105///
5106/// Rounding is done according to the rounding parameter, which can be one of:
5107///
5108/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5109/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5110/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5111/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5112/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5113///
5114/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
5115#[inline]
5116#[target_feature(enable = "avx512fp16")]
5117#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5118#[rustc_legacy_const_generics(3)]
5119#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5120pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5121    unsafe {
5122        static_assert_rounding!(ROUNDING);
5123        transmute(vfcmaddcsh_mask(
5124            transmute(a),
5125            transmute(b),
5126            transmute(c),
5127            0xff,
5128            ROUNDING,
5129        ))
5130    }
5131}
5132
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Work in `__m128` (f32 lanes) so one lane covers one fp16 complex pair.
        let a = transmute(a);
        let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
        // `_mm_mask_move_ss(a, k, a, r)` selects the low 32 bits between the
        // intrinsic result and `a` under bit 0 of `k`, with the remaining bits
        // taken from `a` — NOTE(review): this mirrors the documented
        // "copied from a" merge; confirm against the exhaustive mask tests.
        transmute(_mm_mask_move_ss(a, k, a, r))
    }
}
5167
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Work in `__m128` (f32 lanes) so one lane covers one fp16 complex pair.
        let c = transmute(c);
        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
        // `_mm_move_ss(c, r)` keeps the low 32 bits of the intrinsic result and
        // the upper bits of `c` — NOTE(review): the intrinsic is relied on to
        // produce the mask3 (`c` fallback / upper-from-`a`) semantics in the low
        // lane; confirm against Intel's pseudocode and the mask tests.
        transmute(_mm_move_ss(c, r))
    }
}
5202
5203/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5204/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
5205/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
5206/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
5207/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1`, or the complex
5208/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5209///
5210/// Rounding is done according to the rounding parameter, which can be one of:
5211///
5212/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5213/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5214/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5215/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5216/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5217///
5218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
5219#[inline]
5220#[target_feature(enable = "avx512fp16")]
5221#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5222#[rustc_legacy_const_generics(4)]
5223#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5224pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
5225    k: __mmask8,
5226    a: __m128h,
5227    b: __m128h,
5228    c: __m128h,
5229) -> __m128h {
5230    unsafe {
5231        static_assert_rounding!(ROUNDING);
5232        transmute(vfcmaddcsh_maskz(
5233            transmute(a),
5234            transmute(b),
5235            transmute(c),
5236            k,
5237            ROUNDING,
5238        ))
5239    }
5240}
5241
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // `simd_fma` computes a * b + c lane-wise with a single (fused) rounding.
    unsafe { simd_fma(a, b, c) }
}
5254
5255/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5256/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5257/// from a when the corresponding mask bit is not set).
5258///
5259/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
5260#[inline]
5261#[target_feature(enable = "avx512fp16,avx512vl")]
5262#[cfg_attr(test, assert_instr(vfmadd))]
5263#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5264#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5265pub const fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5266    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) }
5267}
5268
5269/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5270/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5271/// from c when the corresponding mask bit is not set).
5272///
5273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
5274#[inline]
5275#[target_feature(enable = "avx512fp16,avx512vl")]
5276#[cfg_attr(test, assert_instr(vfmadd))]
5277#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5278#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5279pub const fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5280    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) }
5281}
5282
5283/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5284/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5285/// out when the corresponding mask bit is not set).
5286///
5287/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
5288#[inline]
5289#[target_feature(enable = "avx512fp16,avx512vl")]
5290#[cfg_attr(test, assert_instr(vfmadd))]
5291#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5292#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5293pub const fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5294    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) }
5295}
5296
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // `simd_fma` computes a * b + c lane-wise with a single (fused) rounding.
    unsafe { simd_fma(a, b, c) }
}
5309
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) }
}
5323
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) }
}
5337
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Lanes whose mask bit is clear are replaced with zero from `_mm256_setzero_ph()`.
    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) }
}
5351
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // `simd_fma` computes a * b + c per lane as a single fused multiply-add.
    unsafe { simd_fma(a, b, c) }
}
5364
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) }
}
5378
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) }
}
5392
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Lanes whose mask bit is clear are replaced with zero from `_mm512_setzero_ph()`.
    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) }
}
5406
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        vfmaddph_512(a, b, c, ROUNDING)
    }
}
5430
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lanes whose mask bit is clear keep the corresponding element of `a`.
        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
    }
}
5460
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lanes whose mask bit is clear keep the corresponding element of `c`.
        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
    }
}
5490
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lanes whose mask bit is clear are replaced with zero.
        simd_select_bitmask(
            k,
            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
5524
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Scalar operation: only lane 0 of each vector participates.
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        let r = fmaf16(extracta, extractb, extractc);
        // Write the result back into lane 0; lanes 1..8 keep `a`'s values.
        simd_insert!(a, 0, r)
    }
}
5544
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Default to `a`'s lane 0; only overwrite it when mask bit 0 is set.
        let mut fmadd: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmadd = fmaf16(fmadd, extractb, extractc);
        }
        // Lanes 1..8 keep `a`'s values.
        simd_insert!(a, 0, fmadd)
    }
}
5567
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // Default to `c`'s lane 0; only overwrite it when mask bit 0 is set.
        let mut fmadd: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            fmadd = fmaf16(extracta, extractb, fmadd);
        }
        // Note: the upper lanes come from `c` for the mask3 variant.
        simd_insert!(c, 0, fmadd)
    }
}
5590
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Default lane-0 result is zero; only compute when mask bit 0 is set.
        let mut fmadd: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmadd = fmaf16(extracta, extractb, extractc);
        }
        // Lanes 1..8 keep `a`'s values.
        simd_insert!(a, 0, fmadd)
    }
}
5614
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Scalar operation: only lane 0 of each vector participates.
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
        // Write the result back into lane 0; lanes 1..8 keep `a`'s values.
        simd_insert!(a, 0, r)
    }
}
5643
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Default to `a`'s lane 0; only overwrite it when mask bit 0 is set.
        let mut fmadd: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
        }
        // Lanes 1..8 keep `a`'s values.
        simd_insert!(a, 0, fmadd)
    }
}
5680
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Default to `c`'s lane 0; only overwrite it when mask bit 0 is set.
        let mut fmadd: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
        }
        // Note: the upper lanes come from `c` for the mask3 variant.
        simd_insert!(c, 0, fmadd)
    }
}
5717
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Default lane-0 result is zero; only compute when mask bit 0 is set.
        let mut fmadd: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
        }
        // Lanes 1..8 keep `a`'s values.
        simd_insert!(a, 0, fmadd)
    }
}
5755
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // a * b - c, expressed as a fused multiply-add with `c` negated.
    unsafe { simd_fma(a, b, simd_neg(c)) }
}
5769
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
}
5783
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) }
}
5797
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Lanes whose mask bit is clear are replaced with zero from `_mm_setzero_ph()`.
    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) }
}
5811
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // a * b - c, expressed as a fused multiply-add with `c` negated.
    unsafe { simd_fma(a, b, simd_neg(c)) }
}
5824
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) }
}
5838
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) }
}
5852
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Lanes whose mask bit is clear are replaced with zero from `_mm256_setzero_ph()`.
    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) }
}
5866
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // a * b - c, expressed as a fused multiply-add with `c` negated.
    unsafe { simd_fma(a, b, simd_neg(c)) }
}
5879
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) }
}
5893
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) }
}
5907
5908/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5909/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5910/// out when the corresponding mask bit is not set).
5911///
5912/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
5913#[inline]
5914#[target_feature(enable = "avx512fp16")]
5915#[cfg_attr(test, assert_instr(vfmsub))]
5916#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5917#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5918pub const fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5919    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) }
5920}
5921
5922/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5923/// in c from the intermediate result, and store the results in dst.
5924///
5925/// Rounding is done according to the rounding parameter, which can be one of:
5926///
5927/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5928/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5929/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5930/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5931/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5932///
5933/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
5934#[inline]
5935#[target_feature(enable = "avx512fp16")]
5936#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5937#[rustc_legacy_const_generics(3)]
5938#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5939pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5940    unsafe {
5941        static_assert_rounding!(ROUNDING);
5942        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
5943    }
5944}
5945
5946/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5947/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5948/// from a when the corresponding mask bit is not set).
5949///
5950/// Rounding is done according to the rounding parameter, which can be one of:
5951///
5952/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5953/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5954/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5955/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5956/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5957///
5958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
5959#[inline]
5960#[target_feature(enable = "avx512fp16")]
5961#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5962#[rustc_legacy_const_generics(4)]
5963#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5964pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
5965    a: __m512h,
5966    k: __mmask32,
5967    b: __m512h,
5968    c: __m512h,
5969) -> __m512h {
5970    unsafe {
5971        static_assert_rounding!(ROUNDING);
5972        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
5973    }
5974}
5975
5976/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5977/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5978/// from c when the corresponding mask bit is not set).
5979///
5980/// Rounding is done according to the rounding parameter, which can be one of:
5981///
5982/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5983/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5984/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5985/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5986/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5987///
5988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
5989#[inline]
5990#[target_feature(enable = "avx512fp16")]
5991#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5992#[rustc_legacy_const_generics(4)]
5993#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5994pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
5995    a: __m512h,
5996    b: __m512h,
5997    c: __m512h,
5998    k: __mmask32,
5999) -> __m512h {
6000    unsafe {
6001        static_assert_rounding!(ROUNDING);
6002        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
6003    }
6004}
6005
6006/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6007/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
6008/// out when the corresponding mask bit is not set).
6009///
6010/// Rounding is done according to the rounding parameter, which can be one of:
6011///
6012/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6013/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6014/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6015/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6016/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6017///
6018/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
6019#[inline]
6020#[target_feature(enable = "avx512fp16")]
6021#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6022#[rustc_legacy_const_generics(4)]
6023#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6024pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
6025    k: __mmask32,
6026    a: __m512h,
6027    b: __m512h,
6028    c: __m512h,
6029) -> __m512h {
6030    unsafe {
6031        static_assert_rounding!(ROUNDING);
6032        simd_select_bitmask(
6033            k,
6034            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
6035            _mm512_setzero_ph(),
6036        )
6037    }
6038}
6039
6040/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
6041/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
6042/// 7 packed elements from a to the upper elements of dst.
6043///
6044/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
6045#[inline]
6046#[target_feature(enable = "avx512fp16")]
6047#[cfg_attr(test, assert_instr(vfmsub))]
6048#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6049#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6050pub const fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6051    unsafe {
6052        let extracta: f16 = simd_extract!(a, 0);
6053        let extractb: f16 = simd_extract!(b, 0);
6054        let extractc: f16 = simd_extract!(c, 0);
6055        let r = fmaf16(extracta, extractb, -extractc);
6056        simd_insert!(a, 0, r)
6057    }
6058}
6059
6060/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
6061/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6062/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6063/// upper elements of dst.
6064///
6065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
6066#[inline]
6067#[target_feature(enable = "avx512fp16")]
6068#[cfg_attr(test, assert_instr(vfmsub))]
6069#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6070#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6071pub const fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6072    unsafe {
6073        let mut fmsub: f16 = simd_extract!(a, 0);
6074        if k & 1 != 0 {
6075            let extractb: f16 = simd_extract!(b, 0);
6076            let extractc: f16 = simd_extract!(c, 0);
6077            fmsub = fmaf16(fmsub, extractb, -extractc);
6078        }
6079        simd_insert!(a, 0, fmsub)
6080    }
6081}
6082
6083/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
6084/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6085/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6086/// upper elements of dst.
6087///
6088/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
6089#[inline]
6090#[target_feature(enable = "avx512fp16")]
6091#[cfg_attr(test, assert_instr(vfmsub))]
6092#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6093#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6094pub const fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6095    unsafe {
6096        let mut fmsub: f16 = simd_extract!(c, 0);
6097        if k & 1 != 0 {
6098            let extracta: f16 = simd_extract!(a, 0);
6099            let extractb: f16 = simd_extract!(b, 0);
6100            fmsub = fmaf16(extracta, extractb, -fmsub);
6101        }
6102        simd_insert!(c, 0, fmsub)
6103    }
6104}
6105
6106/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
6107/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
6108/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6109/// upper elements of dst.
6110///
6111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
6112#[inline]
6113#[target_feature(enable = "avx512fp16")]
6114#[cfg_attr(test, assert_instr(vfmsub))]
6115#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6116#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6117pub const fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6118    unsafe {
6119        let mut fmsub: f16 = 0.0;
6120        if k & 1 != 0 {
6121            let extracta: f16 = simd_extract!(a, 0);
6122            let extractb: f16 = simd_extract!(b, 0);
6123            let extractc: f16 = simd_extract!(c, 0);
6124            fmsub = fmaf16(extracta, extractb, -extractc);
6125        }
6126        simd_insert!(a, 0, fmsub)
6127    }
6128}
6129
6130/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
6131/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
6132/// 7 packed elements from a to the upper elements of dst.
6133///
6134/// Rounding is done according to the rounding parameter, which can be one of:
6135///
6136/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6137/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6138/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6139/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6140/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6141///
6142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
6143#[inline]
6144#[target_feature(enable = "avx512fp16")]
6145#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6146#[rustc_legacy_const_generics(3)]
6147#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6148pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6149    unsafe {
6150        static_assert_rounding!(ROUNDING);
6151        let extracta: f16 = simd_extract!(a, 0);
6152        let extractb: f16 = simd_extract!(b, 0);
6153        let extractc: f16 = simd_extract!(c, 0);
6154        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6155        simd_insert!(a, 0, r)
6156    }
6157}
6158
6159/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
6160/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6161/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6162/// upper elements of dst.
6163///
6164/// Rounding is done according to the rounding parameter, which can be one of:
6165///
6166/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6167/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6168/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6169/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6170/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6171///
6172/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
6173#[inline]
6174#[target_feature(enable = "avx512fp16")]
6175#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6176#[rustc_legacy_const_generics(4)]
6177#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6178pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
6179    a: __m128h,
6180    k: __mmask8,
6181    b: __m128h,
6182    c: __m128h,
6183) -> __m128h {
6184    unsafe {
6185        static_assert_rounding!(ROUNDING);
6186        let mut fmsub: f16 = simd_extract!(a, 0);
6187        if k & 1 != 0 {
6188            let extractb: f16 = simd_extract!(b, 0);
6189            let extractc: f16 = simd_extract!(c, 0);
6190            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
6191        }
6192        simd_insert!(a, 0, fmsub)
6193    }
6194}
6195
6196/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
6197/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6198/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6199/// upper elements of dst.
6200///
6201/// Rounding is done according to the rounding parameter, which can be one of:
6202///
6203/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6204/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6205/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6206/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6207/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6208///
6209/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
6210#[inline]
6211#[target_feature(enable = "avx512fp16")]
6212#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6213#[rustc_legacy_const_generics(4)]
6214#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6215pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
6216    a: __m128h,
6217    b: __m128h,
6218    c: __m128h,
6219    k: __mmask8,
6220) -> __m128h {
6221    unsafe {
6222        static_assert_rounding!(ROUNDING);
6223        let mut fmsub: f16 = simd_extract!(c, 0);
6224        if k & 1 != 0 {
6225            let extracta: f16 = simd_extract!(a, 0);
6226            let extractb: f16 = simd_extract!(b, 0);
6227            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
6228        }
6229        simd_insert!(c, 0, fmsub)
6230    }
6231}
6232
6233/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
6234/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
6235/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6236/// upper elements of dst.
6237///
6238/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
6239#[inline]
6240#[target_feature(enable = "avx512fp16")]
6241#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6242#[rustc_legacy_const_generics(4)]
6243#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6244pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
6245    k: __mmask8,
6246    a: __m128h,
6247    b: __m128h,
6248    c: __m128h,
6249) -> __m128h {
6250    unsafe {
6251        static_assert_rounding!(ROUNDING);
6252        let mut fmsub: f16 = 0.0;
6253        if k & 1 != 0 {
6254            let extracta: f16 = simd_extract!(a, 0);
6255            let extractb: f16 = simd_extract!(b, 0);
6256            let extractc: f16 = simd_extract!(c, 0);
6257            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6258        }
6259        simd_insert!(a, 0, fmsub)
6260    }
6261}
6262
6263/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6264/// result from packed elements in c, and store the results in dst.
6265///
6266/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
6267#[inline]
6268#[target_feature(enable = "avx512fp16,avx512vl")]
6269#[cfg_attr(test, assert_instr(vfnmadd))]
6270#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6271#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6272pub const fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6273    unsafe { simd_fma(simd_neg(a), b, c) }
6274}
6275
6276/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6277/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6278/// from a when the corresponding mask bit is not set).
6279///
6280/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
6281#[inline]
6282#[target_feature(enable = "avx512fp16,avx512vl")]
6283#[cfg_attr(test, assert_instr(vfnmadd))]
6284#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6285#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6286pub const fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6287    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) }
6288}
6289
6290/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6291/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6292/// from c when the corresponding mask bit is not set).
6293///
6294/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
6295#[inline]
6296#[target_feature(enable = "avx512fp16,avx512vl")]
6297#[cfg_attr(test, assert_instr(vfnmadd))]
6298#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6299#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6300pub const fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6301    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) }
6302}
6303
6304/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6305/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6306/// out when the corresponding mask bit is not set).
6307///
6308/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
6309#[inline]
6310#[target_feature(enable = "avx512fp16,avx512vl")]
6311#[cfg_attr(test, assert_instr(vfnmadd))]
6312#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6313#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6314pub const fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6315    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) }
6316}
6317
6318/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6319/// result from packed elements in c, and store the results in dst.
6320///
6321/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
6322#[inline]
6323#[target_feature(enable = "avx512fp16,avx512vl")]
6324#[cfg_attr(test, assert_instr(vfnmadd))]
6325#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6326#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6327pub const fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6328    unsafe { simd_fma(simd_neg(a), b, c) }
6329}
6330
6331/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6332/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6333/// from a when the corresponding mask bit is not set).
6334///
6335/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
6336#[inline]
6337#[target_feature(enable = "avx512fp16,avx512vl")]
6338#[cfg_attr(test, assert_instr(vfnmadd))]
6339#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6340#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6341pub const fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6342    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) }
6343}
6344
6345/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6346/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6347/// from c when the corresponding mask bit is not set).
6348///
6349/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
6350#[inline]
6351#[target_feature(enable = "avx512fp16,avx512vl")]
6352#[cfg_attr(test, assert_instr(vfnmadd))]
6353#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6354#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6355pub const fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6356    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) }
6357}
6358
6359/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6360/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6361/// out when the corresponding mask bit is not set).
6362///
6363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
6364#[inline]
6365#[target_feature(enable = "avx512fp16,avx512vl")]
6366#[cfg_attr(test, assert_instr(vfnmadd))]
6367#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6368#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6369pub const fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6370    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) }
6371}
6372
6373/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6374/// result from packed elements in c, and store the results in dst.
6375///
6376/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
6377#[inline]
6378#[target_feature(enable = "avx512fp16")]
6379#[cfg_attr(test, assert_instr(vfnmadd))]
6380#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6381#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6382pub const fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6383    unsafe { simd_fma(simd_neg(a), b, c) }
6384}
6385
6386/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6387/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6388/// from a when the corresponding mask bit is not set).
6389///
6390/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
6391#[inline]
6392#[target_feature(enable = "avx512fp16")]
6393#[cfg_attr(test, assert_instr(vfnmadd))]
6394#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6395#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6396pub const fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6397    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) }
6398}
6399
6400/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6401/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6402/// from c when the corresponding mask bit is not set).
6403///
6404/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
6405#[inline]
6406#[target_feature(enable = "avx512fp16")]
6407#[cfg_attr(test, assert_instr(vfnmadd))]
6408#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6409#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6410pub const fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6411    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) }
6412}
6413
6414/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6415/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6416/// out when the corresponding mask bit is not set).
6417///
6418/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
6419#[inline]
6420#[target_feature(enable = "avx512fp16")]
6421#[cfg_attr(test, assert_instr(vfnmadd))]
6422#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6423#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6424pub const fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6425    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
6426}
6427
6428/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6429/// result from packed elements in c, and store the results in dst.
6430///
6431/// Rounding is done according to the rounding parameter, which can be one of:
6432///
6433/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6434/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6435/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6436/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6437/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6438///
6439/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
6440#[inline]
6441#[target_feature(enable = "avx512fp16")]
6442#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6443#[rustc_legacy_const_generics(3)]
6444#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6445pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6446    unsafe {
6447        static_assert_rounding!(ROUNDING);
6448        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
6449    }
6450}
6451
6452/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6453/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6454/// from a when the corresponding mask bit is not set).
6455///
6456/// Rounding is done according to the rounding parameter, which can be one of:
6457///
6458/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6459/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6460/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6461/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6462/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6463///
6464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
6465#[inline]
6466#[target_feature(enable = "avx512fp16")]
6467#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6468#[rustc_legacy_const_generics(4)]
6469#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6470pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
6471    a: __m512h,
6472    k: __mmask32,
6473    b: __m512h,
6474    c: __m512h,
6475) -> __m512h {
6476    unsafe {
6477        static_assert_rounding!(ROUNDING);
6478        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
6479    }
6480}
6481
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // mask3 form: lanes with a clear bit in `k` keep the value from `c`.
        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
    }
}
6511
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // zeromask form: lanes with a clear bit in `k` are zeroed.
        simd_select_bitmask(
            k,
            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
6545
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // fnmadd on lane 0: c - a*b, computed as fused fma(-a, b, c) with a single rounding.
        let r = fmaf16(-extracta, extractb, extractc);
        simd_insert!(a, 0, r)
    }
}
6565
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // `fnmadd` starts as a's lane 0; it is both the fallback value when mask
        // bit 0 is clear and the (negated) multiplicand when the bit is set.
        let mut fnmadd: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fnmadd = fmaf16(-fnmadd, extractb, extractc);
        }
        simd_insert!(a, 0, fnmadd)
    }
}
6588
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // `fnmadd` starts as c's lane 0; it is both the fallback value when mask
        // bit 0 is clear and the addend when the bit is set.
        let mut fnmadd: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            fnmadd = fmaf16(-extracta, extractb, fnmadd);
        }
        simd_insert!(c, 0, fnmadd)
    }
}
6611
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Lane 0 defaults to zero (zeromask semantics) unless mask bit 0 is set.
        let mut fnmadd: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fnmadd = fmaf16(-extracta, extractb, extractc);
        }
        simd_insert!(a, 0, fnmadd)
    }
}
6635
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // Negating `a` turns the fused multiply-add into c - a*b under ROUNDING.
        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
        simd_insert!(a, 0, r)
    }
}
6664
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // `fnmadd` starts as a's lane 0: fallback value when mask bit 0 is clear,
        // (negated) multiplicand when the bit is set.
        let mut fnmadd: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
        }
        simd_insert!(a, 0, fnmadd)
    }
}
6701
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // `fnmadd` starts as c's lane 0: fallback value when mask bit 0 is clear,
        // addend when the bit is set.
        let mut fnmadd: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
        }
        simd_insert!(c, 0, fnmadd)
    }
}
6738
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Lane 0 defaults to zero (zeromask semantics) unless mask bit 0 is set.
        let mut fnmadd: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
        }
        simd_insert!(a, 0, fnmadd)
    }
}
6776
6777/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6778/// in c from the negated intermediate result, and store the results in dst.
6779///
6780/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
6781#[inline]
6782#[target_feature(enable = "avx512fp16,avx512vl")]
6783#[cfg_attr(test, assert_instr(vfnmsub))]
6784#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6785#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6786pub const fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6787    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6788}
6789
6790/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6791/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6792/// copied from a when the corresponding mask bit is not set).
6793///
6794/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
6795#[inline]
6796#[target_feature(enable = "avx512fp16,avx512vl")]
6797#[cfg_attr(test, assert_instr(vfnmsub))]
6798#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6799#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6800pub const fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6801    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
6802}
6803
6804/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6805/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6806/// copied from c when the corresponding mask bit is not set).
6807///
6808/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
6809#[inline]
6810#[target_feature(enable = "avx512fp16,avx512vl")]
6811#[cfg_attr(test, assert_instr(vfnmsub))]
6812#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6813#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6814pub const fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6815    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) }
6816}
6817
6818/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6819/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6820/// zeroed out when the corresponding mask bit is not set).
6821///
6822/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
6823#[inline]
6824#[target_feature(enable = "avx512fp16,avx512vl")]
6825#[cfg_attr(test, assert_instr(vfnmsub))]
6826#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6827#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6828pub const fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6829    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) }
6830}
6831
6832/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6833/// in c from the negated intermediate result, and store the results in dst.
6834///
6835/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
6836#[inline]
6837#[target_feature(enable = "avx512fp16,avx512vl")]
6838#[cfg_attr(test, assert_instr(vfnmsub))]
6839#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6840#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6841pub const fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6842    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6843}
6844
6845/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6846/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6847/// copied from a when the corresponding mask bit is not set).
6848///
6849/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
6850#[inline]
6851#[target_feature(enable = "avx512fp16,avx512vl")]
6852#[cfg_attr(test, assert_instr(vfnmsub))]
6853#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6854#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6855pub const fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6856    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) }
6857}
6858
6859/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6860/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6861/// copied from c when the corresponding mask bit is not set).
6862///
6863/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
6864#[inline]
6865#[target_feature(enable = "avx512fp16,avx512vl")]
6866#[cfg_attr(test, assert_instr(vfnmsub))]
6867#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6868#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6869pub const fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6870    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) }
6871}
6872
6873/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6874/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6875/// zeroed out when the corresponding mask bit is not set).
6876///
6877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
6878#[inline]
6879#[target_feature(enable = "avx512fp16,avx512vl")]
6880#[cfg_attr(test, assert_instr(vfnmsub))]
6881#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6882#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6883pub const fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6884    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) }
6885}
6886
6887/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6888/// in c from the negated intermediate result, and store the results in dst.
6889///
6890/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
6891#[inline]
6892#[target_feature(enable = "avx512fp16")]
6893#[cfg_attr(test, assert_instr(vfnmsub))]
6894#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6895#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6896pub const fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6897    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6898}
6899
6900/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6901/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6902/// copied from a when the corresponding mask bit is not set).
6903///
6904/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
6905#[inline]
6906#[target_feature(enable = "avx512fp16")]
6907#[cfg_attr(test, assert_instr(vfnmsub))]
6908#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6909#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6910pub const fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6911    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) }
6912}
6913
6914/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6915/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6916/// copied from c when the corresponding mask bit is not set).
6917///
6918/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
6919#[inline]
6920#[target_feature(enable = "avx512fp16")]
6921#[cfg_attr(test, assert_instr(vfnmsub))]
6922#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6923#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6924pub const fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6925    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) }
6926}
6927
6928/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6929/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6930/// zeroed out when the corresponding mask bit is not set).
6931///
6932/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
6933#[inline]
6934#[target_feature(enable = "avx512fp16")]
6935#[cfg_attr(test, assert_instr(vfnmsub))]
6936#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
6937#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6938pub const fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6939    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
6940}
6941
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Negating both `a` and `c` turns the rounded fma into -(a*b) - c per lane.
        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
    }
}
6965
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Writemask form: lanes with a clear bit in `k` keep the value from `a`.
        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
    }
}
6995
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // mask3 form: lanes with a clear bit in `k` keep the value from `c`.
        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
    }
}
7025
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // zeromask form: lanes with a clear bit in `k` are zeroed.
        simd_select_bitmask(
            k,
            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
7059
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst,
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // fnmsub on lane 0: -(a*b) - c, computed as fused fma(-a, b, -c) with a single rounding.
        let r = fmaf16(-extracta, extractb, -extractc);
        simd_insert!(a, 0, r)
    }
}
7079
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // `fnmsub` starts as a's lane 0: fallback value when mask bit 0 is clear,
        // (negated) multiplicand when the bit is set.
        let mut fnmsub: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fnmsub = fmaf16(-fnmsub, extractb, -extractc);
        }
        simd_insert!(a, 0, fnmsub)
    }
}
7102
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // mask3 variant: lane 0 falls back to c's value when bit 0 is clear,
        // and the upper lanes are copied from c (not a).
        let mut fnmsub: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            // -(a * b) - c as a single fused fma(-a, b, -c).
            fnmsub = fmaf16(-extracta, extractb, -fnmsub);
        }
        simd_insert!(c, 0, fnmsub)
    }
}
7125
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Zero-masking: lane 0 becomes 0.0 when mask bit 0 is clear.
        let mut fnmsub: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a * b) - c as a single fused fma(-a, b, -c).
            fnmsub = fmaf16(-extracta, extractb, -extractc);
        }
        simd_insert!(a, 0, fnmsub)
    }
}
7149
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // fnmsub = -(a * b) - c, built from the rounding-aware scalar
        // fma intrinsic by negating both the multiplicand and the addend.
        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
        simd_insert!(a, 0, r)
    }
}
7178
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Default: lane 0 keeps a's value when mask bit 0 is clear.
        let mut fnmsub: f16 = simd_extract!(a, 0);
        // Only bit 0 of the mask participates in a scalar (sh) operation.
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a * b) - c via the rounding-aware fma with negated operands.
            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
        }
        simd_insert!(a, 0, fnmsub)
    }
}
7215
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // mask3 variant: lane 0 falls back to c's value when bit 0 is clear,
        // and the upper lanes are copied from c (not a).
        let mut fnmsub: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            // -(a * b) - c via the rounding-aware fma with negated operands.
            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
        }
        simd_insert!(c, 0, fnmsub)
    }
}
7252
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Zero-masking: lane 0 becomes 0.0 when mask bit 0 is clear.
        let mut fnmsub: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a * b) - c via the rounding-aware fma with negated operands.
            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
        }
        simd_insert!(a, 0, fnmsub)
    }
}
7290
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Compute both candidates, then interleave per lane:
        // even-indexed lanes take a*b - c, odd-indexed lanes take a*b + c.
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
        // Shuffle indices >= 8 select from the second operand (`add`).
        simd_shuffle!(sub, add, [0, 9, 2, 11, 4, 13, 6, 15])
    }
}
7307
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from a.
    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
}
7321
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from c.
    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) }
}
7335
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Per-lane merge: mask bit set -> fused result, clear -> zero.
    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) }
}
7349
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // Compute both candidates, then interleave per lane:
        // even-indexed lanes take a*b - c, odd-indexed lanes take a*b + c.
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
        // Shuffle indices >= 16 select from the second operand (`add`).
        simd_shuffle!(
            sub,
            add,
            [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31]
        )
    }
}
7370
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from a.
    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) }
}
7384
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from c.
    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) }
}
7398
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Per-lane merge: mask bit set -> fused result, clear -> zero.
    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) }
}
7412
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        // Compute both candidates, then interleave per lane:
        // even-indexed lanes take a*b - c, odd-indexed lanes take a*b + c.
        let add = simd_fma(a, b, c);
        let sub = simd_fma(a, b, simd_neg(c));
        // Shuffle indices >= 32 select from the second operand (`add`).
        simd_shuffle!(
            sub,
            add,
            [
                0, 33, 2, 35, 4, 37, 6, 39, 8, 41, 10, 43, 12, 45, 14, 47, 16, 49, 18, 51, 20, 53,
                22, 55, 24, 57, 26, 59, 28, 61, 30, 63
            ]
        )
    }
}
7436
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from a.
    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) }
}
7450
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from c.
    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
}
7464
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Per-lane merge: mask bit set -> fused result, clear -> zero.
    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
}
7478
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid ROUNDING values at compile time, then defer to the
        // rounding-aware LLVM intrinsic for the full 512-bit operation.
        static_assert_rounding!(ROUNDING);
        vfmaddsubph_512(a, b, c, ROUNDING)
    }
}
7506
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Per-lane merge: mask bit set -> fused result, clear -> lane from a.
        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
    }
}
7536
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Per-lane merge: mask bit set -> fused result, clear -> lane from c.
        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
    }
}
7566
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Per-lane merge: mask bit set -> fused result, clear -> zero.
        simd_select_bitmask(
            k,
            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
7600
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // fmsubadd is fmaddsub with c negated: negating c swaps each lane's
    // add/subtract role, turning add-sub alternation into sub-add.
    _mm_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
7613
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from a.
    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
}
7627
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from c.
    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) }
}
7641
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Per-lane merge: mask bit set -> fused result, clear -> zero.
    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) }
}
7655
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // fmsubadd is fmaddsub with c negated: negating c swaps each lane's
    // add/subtract role, turning add-sub alternation into sub-add.
    _mm256_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
7668
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    // Per-lane merge: mask bit set -> fused result, clear -> lane from a.
    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) }
}
7682
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    // mask3 variant: lanes whose mask bit is clear keep the corresponding lane of `c`.
    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) }
}
7696
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Zeroing semantics: lanes whose mask bit is clear are forced to 0.0.
    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) }
}
7710
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // fmsubadd(a, b, c) == fmaddsub(a, b, -c): negating `c` swaps the
    // alternating add/subtract roles, so the fmaddsub helper is reused.
    _mm512_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}
7723
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Merge semantics: lanes whose mask bit is clear keep the corresponding lane of `a`.
    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) }
}
7737
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // mask3 variant: lanes whose mask bit is clear keep the corresponding lane of `c`.
    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
}
7751
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Zeroing semantics: lanes whose mask bit is clear are forced to 0.0.
    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
}
7765
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // fmsubadd == fmaddsub with `c` negated, so the fmaddsub intrinsic is reused.
        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
    }
}
7793
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Merge semantics: lanes whose mask bit is clear keep the corresponding lane of `a`.
        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
    }
}
7823
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // mask3 variant: lanes whose mask bit is clear keep the corresponding lane of `c`.
        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
    }
}
7853
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Zeroing semantics: lanes whose mask bit is clear are forced to 0.0.
        simd_select_bitmask(
            k,
            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
7887
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
    // All 8 mask bits set, so every lane is written and the undefined `src` is never selected.
    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
}
7899
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // The underlying intrinsic performs the masked merge with `src` itself.
    unsafe { vrcpph_128(a, src, k) }
}
7912
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
    // Zeroing is expressed as merge-masking against an all-zero source.
    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
}
7925
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_rcp_ph(a: __m256h) -> __m256h {
    // All 16 mask bits set, so every lane is written and the undefined `src` is never selected.
    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
}
7937
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // The underlying intrinsic performs the masked merge with `src` itself.
    unsafe { vrcpph_256(a, src, k) }
}
7950
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
    // Zeroing is expressed as merge-masking against an all-zero source.
    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
}
7963
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_rcp_ph(a: __m512h) -> __m512h {
    // All 32 mask bits set, so every lane is written and the undefined `src` is never selected.
    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
}
7975
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // The underlying intrinsic performs the masked merge with `src` itself.
    unsafe { vrcpph_512(a, src, k) }
}
7988
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
    // Zeroing is expressed as merge-masking against an all-zero source.
    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
}
8001
/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
/// upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
    // Full mask means the zero `src` is never selected; only mask bit 0 matters
    // for the scalar lane anyway.
    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
8015
/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst using writemask k (the element is copied from src when
/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // The underlying intrinsic handles both the scalar-lane merge with `src`
    // and the pass-through of the upper lanes of `a`.
    unsafe { vrcpsh(a, b, src, k) }
}
8029
/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zeroing is expressed as merge-masking against an all-zero source.
    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8043
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
    // All 8 mask bits set, so every lane is written and the undefined `src` is never selected.
    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
}
8056
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // The underlying intrinsic performs the masked merge with `src` itself.
    unsafe { vrsqrtph_128(a, src, k) }
}
8070
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
    // Zeroing is expressed as merge-masking against an all-zero source.
    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
}
8084
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
    // All 16 mask bits set, so every lane is written and the undefined `src` is never selected.
    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
}
8097
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // The underlying intrinsic performs the masked merge with `src` itself.
    unsafe { vrsqrtph_256(a, src, k) }
}
8111
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
    // Zeroing is expressed as merge-masking against an all-zero source.
    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
}
8125
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
    // All 32 mask bits set, so every lane is written and the undefined `src` is never selected.
    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
}
8138
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // The underlying intrinsic performs the masked merge with `src` itself.
    unsafe { vrsqrtph_512(a, src, k) }
}
8152
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
    // Zeroing is expressed as merge-masking against an all-zero source.
    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
}
8166
/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
/// to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
    // Full mask means the zero `src` is never selected; only mask bit 0 matters
    // for the scalar lane anyway.
    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
8180
/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // The underlying intrinsic handles both the scalar-lane merge with `src`
    // and the pass-through of the upper lanes of `a`.
    unsafe { vrsqrtsh(a, b, src, k) }
}
8194
/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zeroing is expressed as merge-masking against an all-zero source.
    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8208
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_sqrt_ph(a: __m128h) -> __m128h {
    // Exact sqrt (unlike the rcp/rsqrt approximations), via the generic SIMD
    // intrinsic; assert_instr above checks it lowers to `vsqrtph`.
    unsafe { simd_fsqrt(a) }
}
8220
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // Merge semantics: lanes whose mask bit is clear keep the corresponding lane of `src`.
    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) }
}
8232
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
    // Zeroing semantics: lanes whose mask bit is clear are forced to 0.0.
    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) }
}
8244
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
    // Exact sqrt via the generic SIMD intrinsic; assert_instr above checks it
    // lowers to `vsqrtph`.
    unsafe { simd_fsqrt(a) }
}
8256
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Bit i of `k` picks lane i of the sqrt result; cleared bits keep `src`.
    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) }
}
8268
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Lanes with a cleared mask bit are zeroed instead of copied from a source.
    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) }
}
8280
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
    // SAFETY: `simd_fsqrt` is a lane-wise SIMD intrinsic; the CPU features it
    // needs are guaranteed by the `#[target_feature]` attribute above.
    unsafe { simd_fsqrt(a) }
}
8292
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Bit i of `k` picks lane i of the sqrt result; cleared bits keep `src`.
    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
}
8304
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Lanes with a cleared mask bit are zeroed instead of copied from a source.
    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
}
8316
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
    unsafe {
        // Reject rounding-mode values not in the documented set at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: `vsqrtph_512` is the raw VSQRTPH binding (declared elsewhere in
        // this module); the required CPU features are enabled via `#[target_feature]`.
        vsqrtph_512(a, ROUNDING)
    }
}
8339
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    unsafe {
        // Reject rounding-mode values not in the documented set at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: features enabled via `#[target_feature]`. Bit i of `k`
        // selects lane i of the rounded sqrt result; cleared bits keep `src`.
        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
    }
}
8366
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
    unsafe {
        // Reject rounding-mode values not in the documented set at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: features enabled via `#[target_feature]`. Lanes with a
        // cleared mask bit are zeroed instead of copied from a source.
        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
    }
}
8389
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
    // Unmasked form: delegate to the masked variant with an all-ones mask, so
    // the `src` operand (zeros here) is never observable in the result.
    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
8402
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask
/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
8415
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is the masked form with an all-zero `src` vector.
    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8428
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Reject rounding-mode values not in the documented set at compile time.
    static_assert_rounding!(ROUNDING);
    // All-ones mask: the zero `src` operand is never observable in the result.
    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
8450
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask
/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Reject rounding-mode values not in the documented set at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: `vsqrtsh` is the raw VSQRTSH binding (declared elsewhere in
        // this module); the required CPU features are enabled via `#[target_feature]`.
        vsqrtsh(a, b, src, k, ROUNDING)
    }
}
8479
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject rounding-mode values not in the documented set at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking is the masked form with an all-zero `src` vector.
    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
8505
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: `vmaxph_128` is the raw VMAXPH binding (declared elsewhere in
    // this module); required CPU features are enabled via `#[target_feature]`.
    unsafe { vmaxph_128(a, b) }
}
8518
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Bit i of `k` picks lane i of the max result; cleared bits keep `src`.
    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) }
}
8532
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Lanes with a cleared mask bit are zeroed instead of copied from a source.
    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) }
}
8546
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: `vmaxph_256` is the raw VMAXPH binding (declared elsewhere in
    // this module); required CPU features are enabled via `#[target_feature]`.
    unsafe { vmaxph_256(a, b) }
}
8559
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Bit i of `k` picks lane i of the max result; cleared bits keep `src`.
    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) }
}
8573
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Lanes with a cleared mask bit are zeroed instead of copied from a source.
    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) }
}
8587
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
    // Delegate to the SAE variant using the current MXCSR exception behavior.
    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
}
8600
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Bit i of `k` picks lane i of the max result; cleared bits keep `src`.
    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) }
}
8614
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Lanes with a cleared mask bit are zeroed instead of copied from a source.
    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) }
}
8628
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
    unsafe {
        // Only _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC are valid SAE values.
        static_assert_sae!(SAE);
        // SAFETY: `vmaxph_512` is the raw VMAXPH binding (declared elsewhere in
        // this module); required CPU features are enabled via `#[target_feature]`.
        vmaxph_512(a, b, SAE)
    }
}
8646
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_max_round_ph<const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        // Only _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC are valid SAE values.
        static_assert_sae!(SAE);
        // SAFETY: features enabled via `#[target_feature]`. Bit i of `k`
        // selects lane i of the max result; cleared bits keep `src`.
        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
    }
}
8669
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    unsafe {
        // Only _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC are valid SAE values.
        static_assert_sae!(SAE);
        // SAFETY: features enabled via `#[target_feature]`. Lanes with a
        // cleared mask bit are zeroed instead of copied from a source.
        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
    }
}
8687
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask means the `src` operand is never observable, so an
    // undefined vector is used to avoid materializing zeros.
    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
}
8701
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the SAE variant using the current MXCSR exception behavior.
    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
8715
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is the masked form with an all-zero `src` vector.
    _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8729
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Only _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC are valid SAE values.
    static_assert_sae!(SAE);
    // All-ones mask: the undefined `src` operand is never observable.
    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
}
8745
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_max_round_sh<const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Only _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC are valid SAE values.
        static_assert_sae!(SAE);
        // SAFETY: `vmaxsh` is the raw VMAXSH binding (declared elsewhere in
        // this module); required CPU features are enabled via `#[target_feature]`.
        vmaxsh(a, b, src, k, SAE)
    }
}
8769
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Only _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC are valid SAE values.
    static_assert_sae!(SAE);
    // Zero-masking is the masked form with an all-zero `src` vector.
    _mm_mask_max_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
8786
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: `vminph_128` is the raw VMINPH binding (declared elsewhere in
    // this module); required CPU features are enabled via `#[target_feature]`.
    unsafe { vminph_128(a, b) }
}
8799
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: required CPU features are enabled via `#[target_feature]`.
    // Bit i of `k` picks lane i of the min result; cleared bits keep `src`.
    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
}
8813
8814/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8815/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8816/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8817/// NaN or signed-zero values.
8818///
8819/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
8820#[inline]
8821#[target_feature(enable = "avx512fp16,avx512vl")]
8822#[cfg_attr(test, assert_instr(vminph))]
8823#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8824pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8825    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) }
8826}
8827
8828/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8829/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8830/// when inputs are NaN or signed-zero values.
8831///
8832/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: the `vminph` intrinsic requires `avx512fp16`/`avx512vl`, which
    // are guaranteed by the `#[target_feature]` attribute above.
    unsafe { vminph_256(a, b) }
}
8840
8841/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8842/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8843/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8844/// NaN or signed-zero values.
8845///
8846/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
8847#[inline]
8848#[target_feature(enable = "avx512fp16,avx512vl")]
8849#[cfg_attr(test, assert_instr(vminph))]
8850#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8851pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8852    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) }
8853}
8854
8855/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8856/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8857/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8858/// NaN or signed-zero values.
8859///
8860/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
8861#[inline]
8862#[target_feature(enable = "avx512fp16,avx512vl")]
8863#[cfg_attr(test, assert_instr(vminph))]
8864#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8865pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8866    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) }
8867}
8868
8869/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8870/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8871/// when inputs are NaN or signed-zero values.
8872///
8873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
8874#[inline]
8875#[target_feature(enable = "avx512fp16")]
8876#[cfg_attr(test, assert_instr(vminph))]
8877#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8878pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
8879    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8880}
8881
8882/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8883/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8884/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8885/// NaN or signed-zero values.
8886///
8887/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
8888#[inline]
8889#[target_feature(enable = "avx512fp16")]
8890#[cfg_attr(test, assert_instr(vminph))]
8891#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8892pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8893    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
8894}
8895
8896/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8897/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8898/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8899/// NaN or signed-zero values.
8900///
8901/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
8902#[inline]
8903#[target_feature(enable = "avx512fp16")]
8904#[cfg_attr(test, assert_instr(vminph))]
8905#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8906pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8907    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
8908}
8909
8910/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8911/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
8912/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8913///
8914/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
8915#[inline]
8916#[target_feature(enable = "avx512fp16")]
8917#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8918#[rustc_legacy_const_generics(2)]
8919#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8920pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8921    unsafe {
8922        static_assert_sae!(SAE);
8923        vminph_512(a, b, SAE)
8924    }
8925}
8926
8927/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8928/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8929/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8930/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8931///
8932/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
8933#[inline]
8934#[target_feature(enable = "avx512fp16")]
8935#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8936#[rustc_legacy_const_generics(4)]
8937#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8938pub fn _mm512_mask_min_round_ph<const SAE: i32>(
8939    src: __m512h,
8940    k: __mmask32,
8941    a: __m512h,
8942    b: __m512h,
8943) -> __m512h {
8944    unsafe {
8945        static_assert_sae!(SAE);
8946        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
8947    }
8948}
8949
8950/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8951/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8952/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8953/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8954///
8955/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
8956#[inline]
8957#[target_feature(enable = "avx512fp16")]
8958#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8959#[rustc_legacy_const_generics(3)]
8960#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8961pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8962    unsafe {
8963        static_assert_sae!(SAE);
8964        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8965    }
8966}
8967
8968/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8969/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8970/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
8971/// inputs are NaN or signed-zero values.
8972///
8973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
8974#[inline]
8975#[target_feature(enable = "avx512fp16,avx512vl")]
8976#[cfg_attr(test, assert_instr(vminsh))]
8977#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8978pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
8979    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
8980}
8981
8982/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8983/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8984/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8985/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8986///
8987/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
8988#[inline]
8989#[target_feature(enable = "avx512fp16,avx512vl")]
8990#[cfg_attr(test, assert_instr(vminsh))]
8991#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8992pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8993    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8994}
8995
8996/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8997/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8998/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8999/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
9000///
9001/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
9002#[inline]
9003#[target_feature(enable = "avx512fp16,avx512vl")]
9004#[cfg_attr(test, assert_instr(vminsh))]
9005#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9006pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9007    _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b)
9008}
9009
9010/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
9011/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
9012/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
9013/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
9014///
9015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
9016#[inline]
9017#[target_feature(enable = "avx512fp16,avx512vl")]
9018#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
9019#[rustc_legacy_const_generics(2)]
9020#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9021pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
9022    static_assert_sae!(SAE);
9023    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
9024}
9025
9026/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
9027/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9028/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
9029/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
9030/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
9031///
9032/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
9033#[inline]
9034#[target_feature(enable = "avx512fp16,avx512vl")]
9035#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
9036#[rustc_legacy_const_generics(4)]
9037#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9038pub fn _mm_mask_min_round_sh<const SAE: i32>(
9039    src: __m128h,
9040    k: __mmask8,
9041    a: __m128h,
9042    b: __m128h,
9043) -> __m128h {
9044    unsafe {
9045        static_assert_sae!(SAE);
9046        vminsh(a, b, src, k, SAE)
9047    }
9048}
9049
9050/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
9051/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
9052/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
9053/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
9054/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
9055///
9056/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
9057#[inline]
9058#[target_feature(enable = "avx512fp16,avx512vl")]
9059#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
9060#[rustc_legacy_const_generics(3)]
9061#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9062pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9063    static_assert_sae!(SAE);
9064    _mm_mask_min_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9065}
9066
9067/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9068/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
9069/// This intrinsic essentially calculates `floor(log2(x))` for each element.
9070///
9071/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
9072#[inline]
9073#[target_feature(enable = "avx512fp16,avx512vl")]
9074#[cfg_attr(test, assert_instr(vgetexpph))]
9075#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9076pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
9077    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
9078}
9079
9080/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9081/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
9082/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
9083/// `floor(log2(x))` for each element.
9084///
9085/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // SAFETY: the `vgetexpph` intrinsic requires `avx512fp16`/`avx512vl`,
    // which are guaranteed by the `#[target_feature]` attribute above.
    unsafe { vgetexpph_128(a, src, k) }
}
9093
9094/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9095/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9096/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9097/// `floor(log2(x))` for each element.
9098///
9099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
9100#[inline]
9101#[target_feature(enable = "avx512fp16,avx512vl")]
9102#[cfg_attr(test, assert_instr(vgetexpph))]
9103#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9104pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
9105    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
9106}
9107
9108/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9109/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
9110/// This intrinsic essentially calculates `floor(log2(x))` for each element.
9111///
9112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
9113#[inline]
9114#[target_feature(enable = "avx512fp16,avx512vl")]
9115#[cfg_attr(test, assert_instr(vgetexpph))]
9116#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9117pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
9118    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
9119}
9120
9121/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9122/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
9123/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
9124/// `floor(log2(x))` for each element.
9125///
9126/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // SAFETY: the `vgetexpph` intrinsic requires `avx512fp16`/`avx512vl`,
    // which are guaranteed by the `#[target_feature]` attribute above.
    unsafe { vgetexpph_256(a, src, k) }
}
9134
9135/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9136/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9137/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9138/// `floor(log2(x))` for each element.
9139///
9140/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
9141#[inline]
9142#[target_feature(enable = "avx512fp16,avx512vl")]
9143#[cfg_attr(test, assert_instr(vgetexpph))]
9144#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9145pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
9146    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
9147}
9148
9149/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9150/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
9151/// This intrinsic essentially calculates `floor(log2(x))` for each element.
9152///
9153/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
9154#[inline]
9155#[target_feature(enable = "avx512fp16")]
9156#[cfg_attr(test, assert_instr(vgetexpph))]
9157#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9158pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
9159    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
9160}
9161
9162/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9163/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
9164/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
9165/// `floor(log2(x))` for each element.
9166///
9167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
9168#[inline]
9169#[target_feature(enable = "avx512fp16")]
9170#[cfg_attr(test, assert_instr(vgetexpph))]
9171#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9172pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
9173    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
9174}
9175
9176/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9177/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9178/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9179/// `floor(log2(x))` for each element.
9180///
9181/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
9182#[inline]
9183#[target_feature(enable = "avx512fp16")]
9184#[cfg_attr(test, assert_instr(vgetexpph))]
9185#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9186pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
9187    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
9188}
9189
9190/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9191/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
9192/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
9193/// by passing _MM_FROUND_NO_EXC in the sae parameter
9194///
9195/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
9196#[inline]
9197#[target_feature(enable = "avx512fp16")]
9198#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9199#[rustc_legacy_const_generics(1)]
9200#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9201pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
9202    static_assert_sae!(SAE);
9203    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9204}
9205
9206/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9207/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
9208/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
9209/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9210///
9211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
9212#[inline]
9213#[target_feature(enable = "avx512fp16")]
9214#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9215#[rustc_legacy_const_generics(3)]
9216#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9217pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
9218    src: __m512h,
9219    k: __mmask32,
9220    a: __m512h,
9221) -> __m512h {
9222    unsafe {
9223        static_assert_sae!(SAE);
9224        vgetexpph_512(a, src, k, SAE)
9225    }
9226}
9227
9228/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9229/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9230/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9231/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9232///
9233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
9234#[inline]
9235#[target_feature(enable = "avx512fp16")]
9236#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9237#[rustc_legacy_const_generics(2)]
9238#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9239pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
9240    static_assert_sae!(SAE);
9241    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
9242}
9243
9244/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9245/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9246/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9247/// calculates `floor(log2(x))` for the lower element.
9248///
9249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
9250#[inline]
9251#[target_feature(enable = "avx512fp16")]
9252#[cfg_attr(test, assert_instr(vgetexpsh))]
9253#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9254pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
9255    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
9256}
9257
9258/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9259/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9260/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9261/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9262/// for the lower element.
9263///
9264/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
9265#[inline]
9266#[target_feature(enable = "avx512fp16")]
9267#[cfg_attr(test, assert_instr(vgetexpsh))]
9268#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9269pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9270    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9271}
9272
9273/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9274/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9275/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9276/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9277/// lower element.
9278///
9279/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
9280#[inline]
9281#[target_feature(enable = "avx512fp16")]
9282#[cfg_attr(test, assert_instr(vgetexpsh))]
9283#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9284pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9285    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b)
9286}
9287
9288/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9289/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9290/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9291/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9292/// in the sae parameter
9293///
9294/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
9295#[inline]
9296#[target_feature(enable = "avx512fp16")]
9297#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9298#[rustc_legacy_const_generics(2)]
9299#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9300pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
9301    static_assert_sae!(SAE);
9302    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9303}
9304
9305/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9306/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9307/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9308/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9309/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9310///
9311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
9312#[inline]
9313#[target_feature(enable = "avx512fp16")]
9314#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9315#[rustc_legacy_const_generics(4)]
9316#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9317pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
9318    src: __m128h,
9319    k: __mmask8,
9320    a: __m128h,
9321    b: __m128h,
9322) -> __m128h {
9323    unsafe {
9324        static_assert_sae!(SAE);
9325        vgetexpsh(a, b, src, k, SAE)
9326    }
9327}
9328
9329/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9330/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9331/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9332/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9333/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9334///
9335/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
9336#[inline]
9337#[target_feature(enable = "avx512fp16")]
9338#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9339#[rustc_legacy_const_generics(3)]
9340#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9341pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9342    static_assert_sae!(SAE);
9343    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9344}
9345
9346/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9347/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9348/// on the interval range defined by norm and the sign depends on sign and the source sign.
9349///
9350/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9351///
9352///     _MM_MANT_NORM_1_2     // interval [1, 2)
9353///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9354///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9355///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9356///
9357/// The sign is determined by sc which can take the following values:
9358///
9359///     _MM_MANT_SIGN_src     // sign = sign(src)
9360///     _MM_MANT_SIGN_zero    // sign = 0
9361///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9362///
9363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
    a: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // All-ones mask writes every lane, so the undefined src operand of the
    // masked form is never observed in the result.
    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
}
9376
9377/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9378/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9379/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9380/// by norm and the sign depends on sign and the source sign.
9381///
9382/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9383///
9384///     _MM_MANT_NORM_1_2     // interval [1, 2)
9385///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9386///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9387///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9388///
9389/// The sign is determined by sc which can take the following values:
9390///
9391///     _MM_MANT_SIGN_src     // sign = sign(src)
9392///     _MM_MANT_SIGN_zero    // sign = 0
9393///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9394///
9395/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        // Pack the two selectors into the instruction's imm8 operand:
        // per the VGETMANT* encoding, imm8[1:0] selects the normalization
        // interval and imm8[3:2] the sign control.
        vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
    }
}
9415
9416/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9417/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9418/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9419/// by norm and the sign depends on sign and the source sign.
9420///
9421/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9422///
9423///     _MM_MANT_NORM_1_2     // interval [1, 2)
9424///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9425///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9426///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9427///
9428/// The sign is determined by sc which can take the following values:
9429///
9430///     _MM_MANT_SIGN_src     // sign = sign(src)
9431///     _MM_MANT_SIGN_zero    // sign = 0
9432///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9433///
9434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // Zero-masking = write-masking with an all-zero source vector.
    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
}
9451
9452/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9453/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9454/// on the interval range defined by norm and the sign depends on sign and the source sign.
9455///
9456/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9457///
9458///     _MM_MANT_NORM_1_2     // interval [1, 2)
9459///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9460///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9461///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9462///
9463/// The sign is determined by sc which can take the following values:
9464///
9465///     _MM_MANT_SIGN_src     // sign = sign(src)
9466///     _MM_MANT_SIGN_zero    // sign = 0
9467///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9468///
9469/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
    a: __m256h,
) -> __m256h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // All 16 mask bits set: every lane is written, so the undefined src
    // operand never leaks into the result.
    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
}
9482
9483/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9484/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9485/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9486/// by norm and the sign depends on sign and the source sign.
9487///
9488/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9489///
9490///     _MM_MANT_NORM_1_2     // interval [1, 2)
9491///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9492///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9493///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9494///
9495/// The sign is determined by sc which can take the following values:
9496///
9497///     _MM_MANT_SIGN_src     // sign = sign(src)
9498///     _MM_MANT_SIGN_zero    // sign = 0
9499///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9500///
9501/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m256h,
    k: __mmask16,
    a: __m256h,
) -> __m256h {
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        // imm8 layout per the VGETMANT* encoding: interval selector in
        // imm8[1:0], sign control in imm8[3:2].
        vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
    }
}
9521
9522/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9523/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9524/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9525/// by norm and the sign depends on sign and the source sign.
9526///
9527/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9528///
9529///     _MM_MANT_NORM_1_2     // interval [1, 2)
9530///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9531///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9532///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9533///
9534/// The sign is determined by sc which can take the following values:
9535///
9536///     _MM_MANT_SIGN_src     // sign = sign(src)
9537///     _MM_MANT_SIGN_zero    // sign = 0
9538///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9539///
9540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask16,
    a: __m256h,
) -> __m256h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // Zero-masking = write-masking with an all-zero source vector.
    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
}
9557
9558/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9559/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9560/// on the interval range defined by norm and the sign depends on sign and the source sign.
9561///
9562/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9563///
9564///     _MM_MANT_NORM_1_2     // interval [1, 2)
9565///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9566///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9567///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9568///
9569/// The sign is determined by sc which can take the following values:
9570///
9571///     _MM_MANT_SIGN_src     // sign = sign(src)
9572///     _MM_MANT_SIGN_zero    // sign = 0
9573///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9574///
9575/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // All 32 mask bits set: every lane is written, so the undefined src
    // operand never leaks into the result.
    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
}
9588
9589/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9590/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9591/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9592/// by norm and the sign depends on sign and the source sign.
9593///
9594/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9595///
9596///     _MM_MANT_NORM_1_2     // interval [1, 2)
9597///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9598///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9599///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9600///
9601/// The sign is determined by sc which can take the following values:
9602///
9603///     _MM_MANT_SIGN_src     // sign = sign(src)
9604///     _MM_MANT_SIGN_zero    // sign = 0
9605///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9606///
9607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // Delegate to the rounding variant with _MM_FROUND_CUR_DIRECTION,
    // i.e. default exception behavior (no SAE suppression).
    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
}
9625
9626/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9627/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9628/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9629/// by norm and the sign depends on sign and the source sign.
9630///
9631/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9632///
9633///     _MM_MANT_NORM_1_2     // interval [1, 2)
9634///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9635///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9636///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9637///
9638/// The sign is determined by sc which can take the following values:
9639///
9640///     _MM_MANT_SIGN_src     // sign = sign(src)
9641///     _MM_MANT_SIGN_zero    // sign = 0
9642///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9643///
9644/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // Zero-masking = write-masking with an all-zero source vector.
    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
}
9661
9662/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9663/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9664/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9665/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9666///
9667/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9668///
9669///     _MM_MANT_NORM_1_2     // interval [1, 2)
9670///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9671///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9672///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9673///
9674/// The sign is determined by sc which can take the following values:
9675///
9676///     _MM_MANT_SIGN_src     // sign = sign(src)
9677///     _MM_MANT_SIGN_zero    // sign = 0
9678///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9679///
9680/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9681///
9682/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_getmant_round_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    // All 32 mask bits set: every lane is written, so the undefined src
    // operand never leaks into the result.
    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}
9700
9701/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9702/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9703/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9704/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9705/// in the sae parameter
9706///
9707/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9708///
9709///     _MM_MANT_NORM_1_2     // interval [1, 2)
9710///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9711///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9712///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9713///
9714/// The sign is determined by sc which can take the following values:
9715///
9716///     _MM_MANT_SIGN_src     // sign = sign(src)
9717///     _MM_MANT_SIGN_zero    // sign = 0
9718///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9719///
9720/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9721///
9722/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4, 5)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_getmant_round_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        static_assert_sae!(SAE);
        // imm8 layout per the VGETMANT* encoding: interval selector in
        // imm8[1:0], sign control in imm8[3:2]; SAE is passed separately.
        vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
    }
}
9744
9745/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9746/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9747/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9748/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9749/// in the sae parameter
9750///
9751/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9752///
9753///     _MM_MANT_NORM_1_2     // interval [1, 2)
9754///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9755///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9756///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9757///
9758/// The sign is determined by sc which can take the following values:
9759///
9760///     _MM_MANT_SIGN_src     // sign = sign(src)
9761///     _MM_MANT_SIGN_zero    // sign = 0
9762///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9763///
9764/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9765///
9766/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_getmant_round_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    // Zero-masking = write-masking with an all-zero source vector.
    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
}
9785
9786/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9787/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9788/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9789/// on the interval range defined by norm and the sign depends on sign and the source sign.
9790///
9791/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9792///
9793///     _MM_MANT_NORM_1_2     // interval [1, 2)
9794///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9795///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9796///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9797///
9798/// The sign is determined by sc which can take the following values:
9799///
9800///     _MM_MANT_SIGN_src     // sign = sign(src)
9801///     _MM_MANT_SIGN_zero    // sign = 0
9802///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9803///
9804/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_getmant_sh<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // Mask bit 0 is set, so lane 0 is always written and the zero src
    // operand is never observed (upper lanes come from `a` regardless).
    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
9818
9819/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9820/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9821/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9822/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9823/// the source sign.
9824///
9825/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9826///
9827///     _MM_MANT_NORM_1_2     // interval [1, 2)
9828///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9829///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9830///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9831///
9832/// The sign is determined by sc which can take the following values:
9833///
9834///     _MM_MANT_SIGN_src     // sign = sign(src)
9835///     _MM_MANT_SIGN_zero    // sign = 0
9836///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9837///
9838/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(4, 5)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_getmant_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // Delegate to the rounding variant with _MM_FROUND_CUR_DIRECTION,
    // i.e. default exception behavior (no SAE suppression).
    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
9857
9858/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9859/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9860/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9861/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9862/// the source sign.
9863///
9864/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9865///
9866///     _MM_MANT_NORM_1_2     // interval [1, 2)
9867///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9868///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9869///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9870///
9871/// The sign is determined by sc which can take the following values:
9872///
9873///     _MM_MANT_SIGN_src     // sign = sign(src)
9874///     _MM_MANT_SIGN_zero    // sign = 0
9875///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9876///
9877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_getmant_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    // Zero-masking = write-masking with an all-zero source vector.
    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), k, a, b)
}
9895
9896/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9897/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9898/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9899/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9900/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9901///
9902/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9903///
9904///     _MM_MANT_NORM_1_2     // interval [1, 2)
9905///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9906///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9907///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9908///
9909/// The sign is determined by sc which can take the following values:
9910///
9911///     _MM_MANT_SIGN_src     // sign = sign(src)
9912///     _MM_MANT_SIGN_zero    // sign = 0
9913///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9914///
9915/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9916///
9917/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_getmant_round_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    // Mask bit 0 is set, so lane 0 is always written and the zero src
    // operand is never observed (upper lanes come from `a` regardless).
    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
9936
9937/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9938/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9939/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9940/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9941/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9942///
9943/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9944///
9945///     _MM_MANT_NORM_1_2     // interval [1, 2)
9946///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9947///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9948///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9949///
9950/// The sign is determined by sc which can take the following values:
9951///
9952///     _MM_MANT_SIGN_src     // sign = sign(src)
9953///     _MM_MANT_SIGN_zero    // sign = 0
9954///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9955///
9956/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9957///
9958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5, 6)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_getmant_round_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        static_assert_sae!(SAE);
        // imm8 layout per the VGETMANT* encoding: interval selector in
        // imm8[1:0], sign control in imm8[3:2]; SAE is passed separately.
        vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
    }
}
9981
9982/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9983/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9984/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9985/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9986/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9987///
9988/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9989///
9990///     _MM_MANT_NORM_1_2     // interval [1, 2)
9991///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9992///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9993///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9994///
9995/// The sign is determined by sc which can take the following values:
9996///
9997///     _MM_MANT_SIGN_src     // sign = sign(src)
9998///     _MM_MANT_SIGN_zero    // sign = 0
9999///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
10000///
10001/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10002///
10003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4, 5)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_getmant_round_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    // Zero-masking = write-masking with an all-zero source vector.
    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
10023
10024/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10025/// specified by imm8, and store the results in dst.
10026///
10027/// Rounding is done according to the imm8 parameter, which can be one of:
10028///
10029/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10030/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10031/// * [`_MM_FROUND_TO_POS_INF`] : round up
10032/// * [`_MM_FROUND_TO_ZERO`] : truncate
10033/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10034///
10035/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: all 8 mask bits are set, so every lane is written and the
    // undefined src vector is never observed.
    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
}
10045
10046/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10047/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10048/// the corresponding mask bit is not set).
10049///
10050/// Rounding is done according to the imm8 parameter, which can be one of:
10051///
10052/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10053/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10054/// * [`_MM_FROUND_TO_POS_INF`] : round up
10055/// * [`_MM_FROUND_TO_ZERO`] : truncate
10056/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10057///
10058/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
10059#[inline]
10060#[target_feature(enable = "avx512fp16,avx512vl")]
10061#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
10062#[rustc_legacy_const_generics(3)]
10063#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
10064pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
10065    unsafe {
10066        static_assert_uimm_bits!(IMM8, 8);
10067        vrndscaleph_128(a, IMM8, src, k)
10068    }
10069}
10070
10071/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10072/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10073/// mask bit is not set).
10074///
10075/// Rounding is done according to the imm8 parameter, which can be one of:
10076///
10077/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10078/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10079/// * [`_MM_FROUND_TO_POS_INF`] : round up
10080/// * [`_MM_FROUND_TO_ZERO`] : truncate
10081/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10082///
10083/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking: lanes with a cleared mask bit are copied from the
    // all-zero src, i.e. zeroed out.
    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
}
10093
10094/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10095/// specified by imm8, and store the results in dst.
10096///
10097/// Rounding is done according to the imm8 parameter, which can be one of:
10098///
10099/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10100/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10101/// * [`_MM_FROUND_TO_POS_INF`] : round up
10102/// * [`_MM_FROUND_TO_ZERO`] : truncate
10103/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10104///
10105/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: all 16 mask bits are set, so every lane is written and
    // the undefined src vector is never observed.
    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
}
10115
10116/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10117/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10118/// the corresponding mask bit is not set).
10119///
10120/// Rounding is done according to the imm8 parameter, which can be one of:
10121///
10122/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10123/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10124/// * [`_MM_FROUND_TO_POS_INF`] : round up
10125/// * [`_MM_FROUND_TO_ZERO`] : truncate
10126/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10127///
10128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
10129#[inline]
10130#[target_feature(enable = "avx512fp16,avx512vl")]
10131#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
10132#[rustc_legacy_const_generics(3)]
10133#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
10134pub fn _mm256_mask_roundscale_ph<const IMM8: i32>(
10135    src: __m256h,
10136    k: __mmask16,
10137    a: __m256h,
10138) -> __m256h {
10139    unsafe {
10140        static_assert_uimm_bits!(IMM8, 8);
10141        vrndscaleph_256(a, IMM8, src, k)
10142    }
10143}
10144
10145/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10146/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10147/// mask bit is not set).
10148///
10149/// Rounding is done according to the imm8 parameter, which can be one of:
10150///
10151/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10152/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10153/// * [`_MM_FROUND_TO_POS_INF`] : round up
10154/// * [`_MM_FROUND_TO_ZERO`] : truncate
10155/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10156///
10157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking: lanes with a cleared mask bit are copied from the
    // all-zero src, i.e. zeroed out.
    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
}
10167
10168/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10169/// specified by imm8, and store the results in dst.
10170///
10171/// Rounding is done according to the imm8 parameter, which can be one of:
10172///
10173/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10174/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10175/// * [`_MM_FROUND_TO_POS_INF`] : round up
10176/// * [`_MM_FROUND_TO_ZERO`] : truncate
10177/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10178///
10179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: all 32 mask bits are set, so every lane is written and
    // the undefined src vector is never observed.
    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
}
10189
10190/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10191/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10192/// the corresponding mask bit is not set).
10193///
10194/// Rounding is done according to the imm8 parameter, which can be one of:
10195///
10196/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10197/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10198/// * [`_MM_FROUND_TO_POS_INF`] : round up
10199/// * [`_MM_FROUND_TO_ZERO`] : truncate
10200/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10201///
10202/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_roundscale_ph<const IMM8: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Non-`round` form: delegate to the SAE-capable variant with
    // `_MM_FROUND_CUR_DIRECTION` (no exception-suppression override).
    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
}
10216
10217/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10218/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10219/// mask bit is not set).
10220///
10221/// Rounding is done according to the imm8 parameter, which can be one of:
10222///
10223/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10224/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10225/// * [`_MM_FROUND_TO_POS_INF`] : round up
10226/// * [`_MM_FROUND_TO_ZERO`] : truncate
10227/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10228///
10229/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking: lanes with a cleared mask bit are copied from the
    // all-zero src, i.e. zeroed out.
    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
}
10239
10240/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10241/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10242/// in the sae parameter
10243///
10244/// Rounding is done according to the imm8 parameter, which can be one of:
10245///
10246/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10247/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10248/// * [`_MM_FROUND_TO_POS_INF`] : round up
10249/// * [`_MM_FROUND_TO_ZERO`] : truncate
10250/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10251///
10252/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Unmasked form: all 32 mask bits are set, so every lane is written and
    // the undefined src vector is never observed.
    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}
10263
10264/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10265/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10266/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10267/// in the sae parameter
10268///
10269/// Rounding is done according to the imm8 parameter, which can be one of:
10270///
10271/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10272/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10273/// * [`_MM_FROUND_TO_POS_INF`] : round up
10274/// * [`_MM_FROUND_TO_ZERO`] : truncate
10275/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10276///
10277/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
10278#[inline]
10279#[target_feature(enable = "avx512fp16")]
10280#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10281#[rustc_legacy_const_generics(3, 4)]
10282#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
10283pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10284    src: __m512h,
10285    k: __mmask32,
10286    a: __m512h,
10287) -> __m512h {
10288    unsafe {
10289        static_assert_uimm_bits!(IMM8, 8);
10290        static_assert_sae!(SAE);
10291        vrndscaleph_512(a, IMM8, src, k, SAE)
10292    }
10293}
10294
10295/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10296/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10297/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10298///
10299/// Rounding is done according to the imm8 parameter, which can be one of:
10300///
10301/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10302/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10303/// * [`_MM_FROUND_TO_POS_INF`] : round up
10304/// * [`_MM_FROUND_TO_ZERO`] : truncate
10305/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10306///
10307/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking: lanes with a cleared mask bit are copied from the
    // all-zero src, i.e. zeroed out.
    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
}
10321
10322/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10323/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10324/// from a to the upper elements of dst.
10325///
10326/// Rounding is done according to the imm8 parameter, which can be one of:
10327///
10328/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10329/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10330/// * [`_MM_FROUND_TO_POS_INF`] : round up
10331/// * [`_MM_FROUND_TO_ZERO`] : truncate
10332/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10333///
10334/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked scalar form: mask bit 0 is set, so the lower lane is always
    // written and the zeroed src vector is never observed.
    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10344
10345/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10346/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10347/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10348///
10349/// Rounding is done according to the imm8 parameter, which can be one of:
10350///
10351/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10352/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10353/// * [`_MM_FROUND_TO_POS_INF`] : round up
10354/// * [`_MM_FROUND_TO_ZERO`] : truncate
10355/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10356///
10357/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_roundscale_sh<const IMM8: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Non-`round` form: delegate to the SAE-capable variant with
    // `_MM_FROUND_CUR_DIRECTION` (no exception-suppression override).
    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10372
10373/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10374/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10375/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10376///
10377/// Rounding is done according to the imm8 parameter, which can be one of:
10378///
10379/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10380/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10381/// * [`_MM_FROUND_TO_POS_INF`] : round up
10382/// * [`_MM_FROUND_TO_ZERO`] : truncate
10383/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10384///
10385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_roundscale_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking: delegate to the write-masked form with an all-zero src,
    // so a cleared mask bit 0 yields 0 in the lower lane.
    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
}
10395
10396/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10397/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10398/// from a to the upper elements of dst.
10399///
10400/// Rounding is done according to the imm8 parameter, which can be one of:
10401///
10402/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10403/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10404/// * [`_MM_FROUND_TO_POS_INF`] : round up
10405/// * [`_MM_FROUND_TO_ZERO`] : truncate
10406/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10407///
10408/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10409///
10410/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Unmasked scalar form: mask bit 0 is set, so the lower lane is always
    // written and the zeroed src vector is never observed.
    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10421
10422/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10423/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10424/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10425///
10426/// Rounding is done according to the imm8 parameter, which can be one of:
10427///
10428/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10429/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10430/// * [`_MM_FROUND_TO_POS_INF`] : round up
10431/// * [`_MM_FROUND_TO_ZERO`] : truncate
10432/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10433///
10434/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10435///
10436/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
10437#[inline]
10438#[target_feature(enable = "avx512fp16")]
10439#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10440#[rustc_legacy_const_generics(4, 5)]
10441#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
10442pub fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10443    src: __m128h,
10444    k: __mmask8,
10445    a: __m128h,
10446    b: __m128h,
10447) -> __m128h {
10448    unsafe {
10449        static_assert_uimm_bits!(IMM8, 8);
10450        static_assert_sae!(SAE);
10451        vrndscalesh(a, b, src, k, IMM8, SAE)
10452    }
10453}
10454
10455/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10456/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10457/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10458///
10459/// Rounding is done according to the imm8 parameter, which can be one of:
10460///
10461/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10462/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10463/// * [`_MM_FROUND_TO_POS_INF`] : round up
10464/// * [`_MM_FROUND_TO_ZERO`] : truncate
10465/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10466///
10467/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10468///
10469/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking: delegate to the write-masked form with an all-zero src,
    // so a cleared mask bit 0 yields 0 in the lower lane.
    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
10484
10485/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10486/// the results in dst.
10487///
10488/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
    // Unmasked form: all 8 mask bits are set, so every lane is written and the
    // undefined src vector is never observed.
    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
}
10496
10497/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10498/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10499///
10500/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: the `avx512fp16,avx512vl` target features required by the
    // intrinsic are enabled on this function.
    unsafe { vscalefph_128(a, b, src, k) }
}
10508
10509/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10510/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10511///
10512/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking: lanes with a cleared mask bit are copied from the
    // all-zero src, i.e. zeroed out.
    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
}
10520
10521/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10522/// the results in dst.
10523///
10524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
    // Unmasked form: all 16 mask bits are set, so every lane is written and
    // the undefined src vector is never observed.
    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
}
10532
10533/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10534/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10535///
10536/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: the `avx512fp16,avx512vl` target features required by the
    // intrinsic are enabled on this function.
    unsafe { vscalefph_256(a, b, src, k) }
}
10544
10545/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10546/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10547///
10548/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Zero-masking: lanes with a cleared mask bit are copied from the
    // all-zero src, i.e. zeroed out.
    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
}
10556
10557/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10558/// the results in dst.
10559///
10560/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
    // Unmasked form: all 32 mask bits are set, so every lane is written and
    // the undefined src vector is never observed.
    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
}
10568
10569/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10570/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10571///
10572/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Non-`round` form: delegate to the rounding-capable variant with
    // `_MM_FROUND_CUR_DIRECTION` (use the current MXCSR rounding mode).
    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10580
10581/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10582/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10583///
10584/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Zero-masking: lanes with a cleared mask bit are copied from the
    // all-zero src, i.e. zeroed out.
    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
}
10592
10593/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10594/// the results in dst.
10595///
10596/// Rounding is done according to the rounding parameter, which can be one of:
10597///
10598/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10599/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10600/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10601/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10602/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10603///
10604/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Unmasked form: all 32 mask bits are set, so every lane is written and
    // the undefined src vector is never observed.
    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
}
10614
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // The underlying intrinsic takes (a, b, src, mask, rounding): the
        // pass-through operand comes after the two inputs.
        vscalefph_512(a, b, src, k, ROUNDING)
    }
}
10643
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
}
10669
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
    // Mask 0xff selects the result lane, so the zero `src` is never observed.
    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10682
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // No static rounding override: use the current MXCSR rounding mode.
    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10695
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
10708
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Mask 0xff selects the result lane, so the zero `src` is never observed.
    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10731
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // The underlying intrinsic takes (a, b, src, mask, rounding).
        vscalefsh(a, b, src, k, ROUNDING)
    }
}
10761
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
10788
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // All-ones mask (8 lanes) selects every lane, so the undefined `src` is
    // never observed.
    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
}
10810
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_reduce_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // The underlying intrinsic takes (a, imm8, src, mask).
        vreduceph_128(a, IMM8, src, k)
    }
}
10835
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
}
10858
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // All-ones mask (16 lanes) selects every lane, so the undefined `src` is
    // never observed.
    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
}
10880
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_reduce_ph<const IMM8: i32>(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // The underlying intrinsic takes (a, imm8, src, mask).
        vreduceph_256(a, IMM8, src, k)
    }
}
10905
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
}
10928
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // All-ones mask (32 lanes) selects every lane, so the undefined `src` is
    // never observed.
    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
}
10950
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_reduce_ph<const IMM8: i32>(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegate to the `round` variant with no SAE override (current MXCSR mode).
    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
}
10973
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
}
10996
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // All-ones mask (32 lanes) selects every lane, so the undefined `src` is
    // never observed.
    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}
11021
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        // The underlying intrinsic takes (a, imm8, src, mask, sae).
        vreduceph_512(a, IMM8, src, k, SAE)
    }
}
11053
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
}
11082
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
/// upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Mask 0xff has bit 0 set, so the computed lower element is always
    // selected and the zero `src` is never observed.
    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
11105
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
/// a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_reduce_sh<const IMM8: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegate to the `round` variant with no SAE override (current MXCSR mode).
    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
11134
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
}
11158
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Mask 0xff has bit 0 set, so the computed lower element is always
    // selected and the zero `src` is never observed.
    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
11184
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        // The underlying intrinsic takes (a, b, src, mask, imm8, sae).
        vreducesh(a, b, src, k, IMM8, SAE)
    }
}
11218
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking is the masked form with an all-zero pass-through vector.
    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
11249
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_reduce_add_ph(a: __m128h) -> f16 {
    unsafe {
        // Tree reduction: fold the high 4 lanes onto the low 4,
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_add_ph(a, b);
        // then fold lanes 2..3 onto lanes 0..1,
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_add_ph(a, b);
        // and finally add the remaining two partial sums as scalars.
        simd_extract!(a, 0, f16) + simd_extract!(a, 1, f16)
    }
}
11267
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
    unsafe {
        // Split the 256-bit vector into its low and high 128-bit halves,
        // add them lane-wise, then reuse the 128-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_add_ph(_mm_add_ph(p, q))
    }
}
11283
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
    unsafe {
        // Split the 512-bit vector into its low and high 256-bit halves,
        // add them lane-wise, then reuse the 256-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_add_ph(_mm256_add_ph(p, q))
    }
}
11305
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
    unsafe {
        // Tree reduction mirroring _mm_reduce_add_ph, with multiplication:
        // halves, then pairs, then the final two scalar partial products.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_mul_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_mul_ph(a, b);
        simd_extract!(a, 0, f16) * simd_extract!(a, 1, f16)
    }
}
11323
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
    unsafe {
        // Multiply the low and high 128-bit halves lane-wise, then reuse
        // the 128-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_mul_ph(_mm_mul_ph(p, q))
    }
}
11339
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
    unsafe {
        // Multiply the low and high 256-bit halves lane-wise, then reuse
        // the 256-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
    }
}
11361
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_reduce_min_ph(a: __m128h) -> f16 {
    unsafe {
        // Tree reduction: halves, then pairs, then the final two lanes.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_min_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_min_ph(a, b);
        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
        // The last step uses the scalar min so only lane 0 matters.
        simd_extract!(_mm_min_sh(a, b), 0)
    }
}
11379
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
    unsafe {
        // Take the lane-wise minimum of the low and high 128-bit halves,
        // then reuse the 128-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_min_ph(_mm_min_ph(p, q))
    }
}
11394
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
    unsafe {
        // Take the lane-wise minimum of the low and high 256-bit halves,
        // then reuse the 256-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_min_ph(_mm256_min_ph(p, q))
    }
}
11415
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_reduce_max_ph(a: __m128h) -> f16 {
    unsafe {
        // Tree reduction mirroring _mm_reduce_min_ph, with maximum.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_max_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_max_ph(a, b);
        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
        // The last step uses the scalar max so only lane 0 matters.
        simd_extract!(_mm_max_sh(a, b), 0)
    }
}
11433
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
    unsafe {
        // Take the lane-wise maximum of the low and high 128-bit halves,
        // then reuse the 128-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_max_ph(_mm_max_ph(p, q))
    }
}
11448
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
    unsafe {
        // Take the lane-wise maximum of the low and high 256-bit halves,
        // then reuse the 256-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_max_ph(_mm256_max_ph(p, q))
    }
}
11469
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
    unsafe {
        // imm8 is a category bitmask; only the low 8 bits are meaningful.
        static_assert_uimm_bits!(IMM8, 8);
        vfpclassph_128(a, IMM8)
    }
}
11495
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
    // Zero-masking a mask-register result is just a bitwise AND with k1.
    _mm_fpclass_ph_mask::<IMM8>(a) & k1
}
11519
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
    unsafe {
        // imm8 is a category bitmask; only the low 8 bits are meaningful.
        static_assert_uimm_bits!(IMM8, 8);
        vfpclassph_256(a, IMM8)
    }
}
11545
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
    // Zero-masking a mask-register result is just a bitwise AND with k1.
    _mm256_fpclass_ph_mask::<IMM8>(a) & k1
}
11569
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
    unsafe {
        // imm8 is a category bitmask; only the low 8 bits are meaningful.
        static_assert_uimm_bits!(IMM8, 8);
        vfpclassph_512(a, IMM8)
    }
}
11595
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
    // Zero-masking a mask-register result is just a bitwise AND with k1.
    _mm512_fpclass_ph_mask::<IMM8>(a) & k1
}
11619
/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
/// by imm8, and store the result in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
    // Unmasked form: delegate to the masked variant with an all-ones mask.
    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
}
11642
/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
    unsafe {
        // imm8 is a category bitmask; the compiler intrinsic applies k1 itself.
        static_assert_uimm_bits!(IMM8, 8);
        vfpclasssh(a, IMM8, k1)
    }
}
11669
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Lanes with a set mask bit come from b, the rest from a.
    unsafe { simd_select_bitmask(k, b, a) }
}
11681
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Lanes with a set mask bit come from b, the rest from a.
    unsafe { simd_select_bitmask(k, b, a) }
}
11693
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Lanes with a set mask bit come from b, the rest from a.
    unsafe { simd_select_bitmask(k, b, a) }
}
11705
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
    // Delegate to the 16-bit integer permute: the casts only reinterpret
    // the bit pattern, and permuting moves whole 16-bit lanes, so f16
    // values are preserved exactly.
    _mm_castsi128_ph(_mm_permutex2var_epi16(
        _mm_castph_si128(a),
        idx,
        _mm_castph_si128(b),
    ))
}
11720
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
    // Delegate to the 16-bit integer permute; casts are pure bit
    // reinterpretations, so f16 lanes are preserved exactly.
    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
        _mm256_castph_si256(a),
        idx,
        _mm256_castph_si256(b),
    ))
}
11735
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
    // Delegate to the 16-bit integer permute; casts are pure bit
    // reinterpretations, so f16 lanes are preserved exactly.
    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
        _mm512_castph_si512(a),
        idx,
        _mm512_castph_si512(b),
    ))
}
11750
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
    // Reuse the 16-bit integer permute via bit-reinterpreting casts.
    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
}
11761
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
    // Reuse the 16-bit integer permute via bit-reinterpreting casts.
    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
}
11772
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
    // Reuse the 16-bit integer permute via bit-reinterpreting casts.
    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
}
11783
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
    // Non-round variant: convert using the current MXCSR rounding mode.
    unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) }
}
11795
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Convert all lanes, then merge: masked-off lanes are taken from src.
    unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) }
}
11808
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
}
11820
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
    // Non-round variant: convert using the current MXCSR rounding mode.
    unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) }
}
11832
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
    // Convert all lanes, then merge: masked-off lanes are taken from src.
    unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) }
}
11845
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
}
11857
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
    // Non-round variant: convert using the current MXCSR rounding mode.
    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
}
11869
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
    // Convert all lanes, then merge: masked-off lanes are taken from src.
    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
}
11882
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
}
11894
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
    unsafe {
        // ROUNDING must be a valid rounding-control combination (checked at
        // compile time); it is forwarded to the conversion intrinsic.
        static_assert_rounding!(ROUNDING);
        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
    }
}
11918
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512i,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
    }
}
11947
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is merge-masking with an all-zeros source vector.
    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
}
11969
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
    // `_MM_FROUND_CUR_DIRECTION` selects the rounding mode from `MXCSR.RC`.
    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
}
11981
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
    unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) }
}
11994
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is merge-masking with an all-zeros source vector.
    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
}
12006
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
    // `_MM_FROUND_CUR_DIRECTION` selects the rounding mode from `MXCSR.RC`.
    unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) }
}
12018
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
    // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
    unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) }
}
12031
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
    // Zero-masking is merge-masking with an all-zeros source vector.
    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
}
12043
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
    // `_MM_FROUND_CUR_DIRECTION` selects the rounding mode from `MXCSR.RC`.
    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
}
12055
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
    // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
}
12068
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
    // Zero-masking is merge-masking with an all-zeros source vector.
    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
}
12080
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
    }
}
12104
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512i,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
    }
}
12133
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is merge-masking with an all-zeros source vector.
    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
}
12155
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
    // An all-ones 8-lane mask with a zero merge source gives the plain
    // conversion of the four i32 lanes; the upper 64 bits stay zero.
    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
}
12167
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Unlike the wider variants, the 128-bit intrinsic takes the merge
    // source and mask directly instead of a rounding immediate.
    unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) }
}
12180
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is merge-masking with an all-zeros source vector.
    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
}
12193
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
    // Narrowing conversion: eight i32 lanes become eight f16 lanes (128 bits).
    // `_MM_FROUND_CUR_DIRECTION` selects the rounding mode from `MXCSR.RC`.
    unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) }
}
12205
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
    unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) }
}
12218
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
    // Result is 128-bit (narrowing conversion), hence the `_mm_` zero source.
    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
}
12230
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
    // Narrowing conversion: sixteen i32 lanes become sixteen f16 lanes (256 bits).
    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
}
12242
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
    // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
}
12255
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
    // Builds the zero source from `f16x16::ZERO` rather than `_mm256_setzero_ph`;
    // NOTE(review): presumably because this fn enables only `avx512fp16` while the
    // `_mm256_*` helpers in this file also require `avx512vl` — confirm.
    _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a)
}
12267
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
    }
}
12291
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
    }
}
12320
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking via `f16x16::ZERO` as the merge source (see _mm512_maskz_cvtepi32_ph).
    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
}
12342
/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsi2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
    // Scalar conversion into lane 0; `_MM_FROUND_CUR_DIRECTION` uses `MXCSR.RC`.
    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
}
12355
/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtsi2sh(a, b, ROUNDING)
    }
}
12380
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
    // An all-ones 8-lane mask with a zero merge source gives the plain
    // conversion of the four u32 lanes; the upper 64 bits stay zero.
    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
}
12392
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Unlike the wider variants, the 128-bit intrinsic takes the merge
    // source and mask directly instead of a rounding immediate.
    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
}
12405
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is merge-masking with an all-zeros source vector.
    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
}
12418
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
    // Narrowing conversion: eight u32 lanes become eight f16 lanes (128 bits).
    unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) }
}
12430
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
    unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) }
}
12443
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
    // Result is 128-bit (narrowing conversion), hence the `_mm_` zero source.
    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
}
12455
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
    // Narrowing conversion: sixteen u32 lanes become sixteen f16 lanes (256 bits).
    unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) }
}
12467
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
    // Per-lane select: converted value where the mask bit is set, `src` lane otherwise.
    unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) }
}
12480
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
    // Zero source built from `f16x16::ZERO` (see note on _mm512_maskz_cvtepi32_ph).
    _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a)
}
12492
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
    unsafe {
        // Reject unsupported ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
    }
}
12516
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    unsafe {
        // Reject unsupported ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lane i comes from the conversion when bit i of `k` is set, else from `src`.
        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
    }
}
12545
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
    // Reject unsupported ROUNDING values at compile time.
    static_assert_rounding!(ROUNDING);
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
}
12567
/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtusi2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
    // SAFETY: `avx512fp16` is guaranteed by the `#[target_feature]` attribute above.
    // The scalar conversion of `b` rounds per the current MXCSR.RC mode.
    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
}
12580
/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
    unsafe {
        // Reject unsupported ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtusi2sh(a, b, ROUNDING)
    }
}
12605
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
    // All-ones writemask (only the low 2 bits matter for 2 source lanes) turns
    // the masked form into a plain conversion; the zero `src` covers the
    // zeroed-out upper 96 bits of dst.
    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
}
12617
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // SAFETY: `avx512fp16,avx512vl` are guaranteed by `#[target_feature]` above.
    // The 128-bit intrinsic takes `src` and `k` directly, so the write-masking
    // is performed by the underlying operation rather than a separate blend.
    unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) }
}
12630
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
}
12643
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
    // All-ones writemask (only the low 4 bits matter for 4 source lanes) turns
    // the masked form into a plain conversion; the zero `src` covers the
    // zeroed-out upper 64 bits of dst.
    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
}
12655
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // SAFETY: `avx512fp16,avx512vl` are guaranteed by `#[target_feature]` above.
    // The 256-bit intrinsic takes `src` and `k` directly, so the write-masking
    // is performed by the underlying operation rather than a separate blend.
    unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) }
}
12668
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
}
12681
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
    // SAFETY: `avx512fp16` is guaranteed by the `#[target_feature]` attribute above.
    // `_MM_FROUND_CUR_DIRECTION` rounds according to the current MXCSR.RC mode.
    unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) }
}
12693
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
    // Lane i of the result is the converted value when bit i of `k` is set,
    // otherwise lane i of `src`.
    unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) }
}
12706
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a)
}
12718
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
    unsafe {
        // Reject unsupported ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
    }
}
12742
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512i,
) -> __m128h {
    unsafe {
        // Reject unsupported ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lane i comes from the conversion when bit i of `k` is set, else from `src`.
        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
    }
}
12771
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
    // Reject unsupported ROUNDING values at compile time.
    static_assert_rounding!(ROUNDING);
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
}
12793
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
    // All-ones writemask (only the low 2 bits matter for 2 source lanes) turns
    // the masked form into a plain conversion; the zero `src` covers the
    // zeroed-out upper 96 bits of dst.
    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
}
12805
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // SAFETY: `avx512fp16,avx512vl` are guaranteed by `#[target_feature]` above.
    // The 128-bit intrinsic takes `src` and `k` directly, so the write-masking
    // is performed by the underlying operation rather than a separate blend.
    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
}
12818
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
}
12831
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
    // All-ones writemask (only the low 4 bits matter for 4 source lanes) turns
    // the masked form into a plain conversion; the zero `src` covers the
    // zeroed-out upper 64 bits of dst.
    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
}
12843
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // SAFETY: `avx512fp16,avx512vl` are guaranteed by `#[target_feature]` above.
    // The 256-bit intrinsic takes `src` and `k` directly, so the write-masking
    // is performed by the underlying operation rather than a separate blend.
    unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) }
}
12856
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
}
12869
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
    // SAFETY: `avx512fp16` is guaranteed by the `#[target_feature]` attribute above.
    // `_MM_FROUND_CUR_DIRECTION` rounds according to the current MXCSR.RC mode.
    unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) }
}
12881
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
    // Lane i of the result is the converted value when bit i of `k` is set,
    // otherwise lane i of `src`.
    unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) }
}
12894
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a)
}
12906
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
    unsafe {
        // Reject unsupported ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
    }
}
12930
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512i,
) -> __m128h {
    unsafe {
        // Reject unsupported ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lane i comes from the conversion when bit i of `k` is set, else from `src`.
        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
    }
}
12959
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
    // Reject unsupported ROUNDING values at compile time.
    static_assert_rounding!(ROUNDING);
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
}
12981
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
    // All-ones writemask (only the low 4 bits matter for 4 source lanes) turns
    // the masked form into a plain conversion with a zeroed upper half of dst.
    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
}
12993
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
    // SAFETY: `avx512fp16,avx512vl` are guaranteed by `#[target_feature]` above.
    // The 128-bit intrinsic takes `src` and `k` directly, so the write-masking
    // is performed by the underlying operation rather than a separate blend.
    unsafe { vcvtps2phx_128(a, src, k) }
}
13006
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
}
13019
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
    // All-ones writemask (8 source lanes, all mask bits used) turns the masked
    // form into a plain conversion.
    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
}
13031
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
    // SAFETY: `avx512fp16,avx512vl` are guaranteed by `#[target_feature]` above.
    // The 256-bit intrinsic takes `src` and `k` directly, so the write-masking
    // is performed by the underlying operation rather than a separate blend.
    unsafe { vcvtps2phx_256(a, src, k) }
}
13044
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
    // Zeroing form: delegate to the write-masked version with an all-zero source.
    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
}
13057
13058/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
13059/// floating-point elements, and store the results in dst.
13060///
13061/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
13062#[inline]
13063#[target_feature(enable = "avx512fp16")]
13064#[cfg_attr(test, assert_instr(vcvtps2phx))]
13065#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13066pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
13067    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a)
13068}
13069
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
    // SAFETY: delegates to the compiler intrinsic; features guaranteed by
    // `target_feature`. Rounding follows the current MXCSR rounding mode.
    unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13082
13083/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
13084/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13085/// corresponding mask bit is not set).
13086///
13087/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
13088#[inline]
13089#[target_feature(enable = "avx512fp16")]
13090#[cfg_attr(test, assert_instr(vcvtps2phx))]
13091#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13092pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
13093    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a)
13094}
13095
13096/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
13097/// floating-point elements, and store the results in dst.
13098///
13099/// Rounding is done according to the rounding parameter, which can be one of:
13100///
13101/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13102/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13103/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13104/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13105/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13106///
13107/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
13108#[inline]
13109#[target_feature(enable = "avx512fp16")]
13110#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
13111#[rustc_legacy_const_generics(1)]
13112#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13113pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
13114    static_assert_rounding!(ROUNDING);
13115    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), 0xffff, a)
13116}
13117
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512,
) -> __m256h {
    // SAFETY: delegates to the compiler intrinsic; features guaranteed by `target_feature`.
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtps2phx_512(a, src, k, ROUNDING)
    }
}
13146
13147/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
13148/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13149/// corresponding mask bit is not set).
13150///
13151/// Rounding is done according to the rounding parameter, which can be one of:
13152///
13153/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13154/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13155/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13156/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13157/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13158///
13159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
13160#[inline]
13161#[target_feature(enable = "avx512fp16")]
13162#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
13163#[rustc_legacy_const_generics(2)]
13164#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13165pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
13166    static_assert_rounding!(ROUNDING);
13167    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
13168}
13169
13170/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13171/// floating-point elements, store the result in the lower element of dst, and copy the upper 7 packed
13172/// elements from a to the upper elements of dst.
13173///
13174/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
13175#[inline]
13176#[target_feature(enable = "avx512fp16")]
13177#[cfg_attr(test, assert_instr(vcvtss2sh))]
13178#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13179pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
13180    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
13181}
13182
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point elements, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
    // SAFETY: delegates to the compiler intrinsic; features guaranteed by
    // `target_feature`. Rounding follows the current MXCSR rounding mode.
    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13196
13197/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13198/// floating-point elements, store the result in the lower element of dst using zeromask k (the element
13199/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13200/// elements of dst.
13201///
13202/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
13203#[inline]
13204#[target_feature(enable = "avx512fp16")]
13205#[cfg_attr(test, assert_instr(vcvtss2sh))]
13206#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13207pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
13208    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b)
13209}
13210
13211/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13212/// floating-point elements, store the result in the lower element of dst, and copy the upper 7 packed
13213/// elements from a to the upper elements of dst.
13214///
13215/// Rounding is done according to the rounding parameter, which can be one of:
13216///
13217/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13218/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13219/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13220/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13221/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13222///
13223/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
13224#[inline]
13225#[target_feature(enable = "avx512fp16")]
13226#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13227#[rustc_legacy_const_generics(2)]
13228#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13229pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
13230    static_assert_rounding!(ROUNDING);
13231    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
13232}
13233
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point elements, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128,
) -> __m128h {
    // SAFETY: delegates to the compiler intrinsic; features guaranteed by `target_feature`.
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtss2sh(a, b, src, k, ROUNDING)
    }
}
13264
13265/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13266/// floating-point elements, store the result in the lower element of dst using zeromask k (the element
13267/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13268/// elements of dst.
13269///
13270/// Rounding is done according to the rounding parameter, which can be one of:
13271///
13272/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13273/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13274/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13275/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13276/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13277///
13278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
13279#[inline]
13280#[target_feature(enable = "avx512fp16")]
13281#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13282#[rustc_legacy_const_generics(3)]
13283#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13284pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
13285    k: __mmask8,
13286    a: __m128h,
13287    b: __m128,
13288) -> __m128h {
13289    static_assert_rounding!(ROUNDING);
13290    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13291}
13292
13293/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13294/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
13295///
13296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
13297#[inline]
13298#[target_feature(enable = "avx512fp16,avx512vl")]
13299#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13300#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13301pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
13302    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13303}
13304
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
    // SAFETY: delegates to the compiler intrinsic; the required CPU features are
    // guaranteed by the `target_feature` attribute on this function.
    unsafe { vcvtpd2ph_128(a, src, k) }
}
13317
13318/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13319/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13320/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13321///
13322/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
13323#[inline]
13324#[target_feature(enable = "avx512fp16,avx512vl")]
13325#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13326#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13327pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
13328    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13329}
13330
13331/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13332/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
13333///
13334/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
13335#[inline]
13336#[target_feature(enable = "avx512fp16,avx512vl")]
13337#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13338#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13339pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
13340    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13341}
13342
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
    // SAFETY: delegates to the compiler intrinsic; the required CPU features are
    // guaranteed by the `target_feature` attribute on this function.
    unsafe { vcvtpd2ph_256(a, src, k) }
}
13355
13356/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13357/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13358/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13359///
13360/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
13361#[inline]
13362#[target_feature(enable = "avx512fp16,avx512vl")]
13363#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13364#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13365pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
13366    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13367}
13368
13369/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13370/// floating-point elements, and store the results in dst.
13371///
13372/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
13373#[inline]
13374#[target_feature(enable = "avx512fp16")]
13375#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13376#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13377pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
13378    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a)
13379}
13380
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
    // SAFETY: delegates to the compiler intrinsic; features guaranteed by
    // `target_feature`. Rounding follows the current MXCSR rounding mode.
    unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13393
13394/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13395/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13396/// corresponding mask bit is not set).
13397///
13398/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
13399#[inline]
13400#[target_feature(enable = "avx512fp16")]
13401#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13402#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13403pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
13404    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a)
13405}
13406
13407/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13408/// floating-point elements, and store the results in dst.
13409///
13410/// Rounding is done according to the rounding parameter, which can be one of:
13411///
13412/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13413/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13414/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13415/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13416/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13417///
13418/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
13419#[inline]
13420#[target_feature(enable = "avx512fp16")]
13421#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13422#[rustc_legacy_const_generics(1)]
13423#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13424pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
13425    static_assert_rounding!(ROUNDING);
13426    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a)
13427}
13428
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512d,
) -> __m128h {
    // SAFETY: delegates to the compiler intrinsic; features guaranteed by `target_feature`.
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtpd2ph_512(a, src, k, ROUNDING)
    }
}
13457
13458/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13459/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13460/// corresponding mask bit is not set).
13461///
13462/// Rounding is done according to the rounding parameter, which can be one of:
13463///
13464/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13465/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13466/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13467/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13468/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13469///
13470/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
13471#[inline]
13472#[target_feature(enable = "avx512fp16")]
13473#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13474#[rustc_legacy_const_generics(2)]
13475#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13476pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
13477    static_assert_rounding!(ROUNDING);
13478    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
13479}
13480
13481/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13482/// floating-point elements, store the result in the lower element of dst, and copy the upper 7 packed
13483/// elements from a to the upper elements of dst.
13484///
13485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
13486#[inline]
13487#[target_feature(enable = "avx512fp16")]
13488#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13489#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13490pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
13491    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
13492}
13493
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point elements, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
    // SAFETY: delegates to the compiler intrinsic; features guaranteed by
    // `target_feature`. Rounding follows the current MXCSR rounding mode.
    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13507
13508/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13509/// floating-point elements, store the result in the lower element of dst using zeromask k (the element
13510/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13511/// elements of dst.
13512///
13513/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
13514#[inline]
13515#[target_feature(enable = "avx512fp16")]
13516#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13517#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13518pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13519    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b)
13520}
13521
13522/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13523/// floating-point elements, store the result in the lower element of dst, and copy the upper 7 packed
13524/// elements from a to the upper elements of dst.
13525///
13526/// Rounding is done according to the rounding parameter, which can be one of:
13527///
13528/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13529/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13530/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13531/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13532/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13533///
13534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
13535#[inline]
13536#[target_feature(enable = "avx512fp16")]
13537#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13538#[rustc_legacy_const_generics(2)]
13539#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13540pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
13541    static_assert_rounding!(ROUNDING);
13542    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
13543}
13544
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point elements, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128d,
) -> __m128h {
    // SAFETY: delegates to the compiler intrinsic; features guaranteed by `target_feature`.
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtsd2sh(a, b, src, k, ROUNDING)
    }
}
13575
13576/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13577/// floating-point elements, store the result in the lower element of dst using zeromask k (the element
13578/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13579/// elements of dst.
13580///
13581/// Rounding is done according to the rounding parameter, which can be one of:
13582///
13583/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13584/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13585/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13586/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13587/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13588///
13589/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
13590#[inline]
13591#[target_feature(enable = "avx512fp16")]
13592#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13593#[rustc_legacy_const_generics(3)]
13594#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13595pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
13596    k: __mmask8,
13597    a: __m128h,
13598    b: __m128d,
13599) -> __m128h {
13600    static_assert_rounding!(ROUNDING);
13601    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13602}
13603
13604/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13605/// store the results in dst.
13606///
13607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
13608#[inline]
13609#[target_feature(enable = "avx512fp16,avx512vl")]
13610#[cfg_attr(test, assert_instr(vcvtph2w))]
13611#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13612pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
13613    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
13614}
13615
13616/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13617/// store the results in dst using writemask k (elements are copied from src when the corresponding
13618/// mask bit is not set).
13619///
13620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
13621#[inline]
13622#[target_feature(enable = "avx512fp16,avx512vl")]
13623#[cfg_attr(test, assert_instr(vcvtph2w))]
13624#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13625pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13626    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
13627}
13628
13629/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13630/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13631///
13632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
13633#[inline]
13634#[target_feature(enable = "avx512fp16,avx512vl")]
13635#[cfg_attr(test, assert_instr(vcvtph2w))]
13636#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13637pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13638    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
13639}
13640
13641/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13642/// store the results in dst.
13643///
13644/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
13645#[inline]
13646#[target_feature(enable = "avx512fp16,avx512vl")]
13647#[cfg_attr(test, assert_instr(vcvtph2w))]
13648#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13649pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
13650    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
13651}
13652
13653/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13654/// store the results in dst using writemask k (elements are copied from src when the corresponding
13655/// mask bit is not set).
13656///
13657/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
13658#[inline]
13659#[target_feature(enable = "avx512fp16,avx512vl")]
13660#[cfg_attr(test, assert_instr(vcvtph2w))]
13661#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13662pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13663    unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) }
13664}
13665
13666/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13667/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13668///
13669/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
13670#[inline]
13671#[target_feature(enable = "avx512fp16,avx512vl")]
13672#[cfg_attr(test, assert_instr(vcvtph2w))]
13673#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13674pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13675    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
13676}
13677
13678/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13679/// store the results in dst.
13680///
13681/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
13682#[inline]
13683#[target_feature(enable = "avx512fp16")]
13684#[cfg_attr(test, assert_instr(vcvtph2w))]
13685#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13686pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
13687    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13688}
13689
13690/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13691/// store the results in dst using writemask k (elements are copied from src when the corresponding
13692/// mask bit is not set).
13693///
13694/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
13695#[inline]
13696#[target_feature(enable = "avx512fp16")]
13697#[cfg_attr(test, assert_instr(vcvtph2w))]
13698#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13699pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13700    unsafe {
13701        transmute(vcvtph2w_512(
13702            a,
13703            src.as_i16x32(),
13704            k,
13705            _MM_FROUND_CUR_DIRECTION,
13706        ))
13707    }
13708}
13709
13710/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13711/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13712///
13713/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
13714#[inline]
13715#[target_feature(enable = "avx512fp16")]
13716#[cfg_attr(test, assert_instr(vcvtph2w))]
13717#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13718pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13719    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
13720}
13721
13722/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13723/// store the results in dst.
13724///
13725/// Rounding is done according to the rounding parameter, which can be one of:
13726///
13727/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13728/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13729/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13730/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13731/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13732///
13733/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
13734#[inline]
13735#[target_feature(enable = "avx512fp16")]
13736#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13737#[rustc_legacy_const_generics(1)]
13738#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13739pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13740    static_assert_rounding!(ROUNDING);
13741    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13742}
13743
13744/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13745/// store the results in dst using writemask k (elements are copied from src when the corresponding
13746/// mask bit is not set).
13747///
13748/// Rounding is done according to the rounding parameter, which can be one of:
13749///
13750/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13751/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13752/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13753/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13754/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13755///
13756/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
13757#[inline]
13758#[target_feature(enable = "avx512fp16")]
13759#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13760#[rustc_legacy_const_generics(3)]
13761#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13762pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
13763    src: __m512i,
13764    k: __mmask32,
13765    a: __m512h,
13766) -> __m512i {
13767    unsafe {
13768        static_assert_rounding!(ROUNDING);
13769        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
13770    }
13771}
13772
13773/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13774/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13775///
13776/// Rounding is done according to the rounding parameter, which can be one of:
13777///
13778/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13779/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13780/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13781/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13782/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13783///
13784/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
13785#[inline]
13786#[target_feature(enable = "avx512fp16")]
13787#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13788#[rustc_legacy_const_generics(2)]
13789#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13790pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13791    static_assert_rounding!(ROUNDING);
13792    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13793}
13794
13795/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13796/// and store the results in dst.
13797///
13798/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
13799#[inline]
13800#[target_feature(enable = "avx512fp16,avx512vl")]
13801#[cfg_attr(test, assert_instr(vcvtph2uw))]
13802#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13803pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
13804    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
13805}
13806
13807/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13808/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13809/// mask bit is not set).
13810///
13811/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
13812#[inline]
13813#[target_feature(enable = "avx512fp16,avx512vl")]
13814#[cfg_attr(test, assert_instr(vcvtph2uw))]
13815#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13816pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13817    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
13818}
13819
13820/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13821/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13822///
13823/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
13824#[inline]
13825#[target_feature(enable = "avx512fp16,avx512vl")]
13826#[cfg_attr(test, assert_instr(vcvtph2uw))]
13827#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13828pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13829    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
13830}
13831
13832/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13833/// and store the results in dst.
13834///
13835/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
13836#[inline]
13837#[target_feature(enable = "avx512fp16,avx512vl")]
13838#[cfg_attr(test, assert_instr(vcvtph2uw))]
13839#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13840pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
13841    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
13842}
13843
13844/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13845/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13846/// mask bit is not set).
13847///
13848/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
13849#[inline]
13850#[target_feature(enable = "avx512fp16,avx512vl")]
13851#[cfg_attr(test, assert_instr(vcvtph2uw))]
13852#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13853pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13854    unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) }
13855}
13856
13857/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13858/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13859///
13860/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
13861#[inline]
13862#[target_feature(enable = "avx512fp16,avx512vl")]
13863#[cfg_attr(test, assert_instr(vcvtph2uw))]
13864#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13865pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
13866    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
13867}
13868
13869/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13870/// and store the results in dst.
13871///
13872/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
13873#[inline]
13874#[target_feature(enable = "avx512fp16")]
13875#[cfg_attr(test, assert_instr(vcvtph2uw))]
13876#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13877pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
13878    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
13879}
13880
13881/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13882/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13883/// mask bit is not set).
13884///
13885/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
13886#[inline]
13887#[target_feature(enable = "avx512fp16")]
13888#[cfg_attr(test, assert_instr(vcvtph2uw))]
13889#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13890pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13891    unsafe {
13892        transmute(vcvtph2uw_512(
13893            a,
13894            src.as_u16x32(),
13895            k,
13896            _MM_FROUND_CUR_DIRECTION,
13897        ))
13898    }
13899}
13900
13901/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13902/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13903///
13904/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
13905#[inline]
13906#[target_feature(enable = "avx512fp16")]
13907#[cfg_attr(test, assert_instr(vcvtph2uw))]
13908#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13909pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
13910    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
13911}
13912
13913/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13914/// and store the results in dst.
13915///
13916/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13917///
13918/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
13919#[inline]
13920#[target_feature(enable = "avx512fp16")]
13921#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13922#[rustc_legacy_const_generics(1)]
13923#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13924pub fn _mm512_cvt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
13925    static_assert_sae!(SAE);
13926    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
13927}
13928
13929/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13930/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13931/// mask bit is not set).
13932///
13933/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13934///
13935/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
13936#[inline]
13937#[target_feature(enable = "avx512fp16")]
13938#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13939#[rustc_legacy_const_generics(3)]
13940#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13941pub fn _mm512_mask_cvt_roundph_epu16<const SAE: i32>(
13942    src: __m512i,
13943    k: __mmask32,
13944    a: __m512h,
13945) -> __m512i {
13946    unsafe {
13947        static_assert_sae!(SAE);
13948        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, SAE))
13949    }
13950}
13951
13952/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13953/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13954///
13955/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13956///
13957/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
13958#[inline]
13959#[target_feature(enable = "avx512fp16")]
13960#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13961#[rustc_legacy_const_generics(2)]
13962#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13963pub fn _mm512_maskz_cvt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
13964    static_assert_sae!(SAE);
13965    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
13966}
13967
13968/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13969/// truncation, and store the results in dst.
13970///
13971/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
13972#[inline]
13973#[target_feature(enable = "avx512fp16,avx512vl")]
13974#[cfg_attr(test, assert_instr(vcvttph2w))]
13975#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13976pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
13977    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
13978}
13979
13980/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13981/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13982/// mask bit is not set).
13983///
13984/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
13985#[inline]
13986#[target_feature(enable = "avx512fp16,avx512vl")]
13987#[cfg_attr(test, assert_instr(vcvttph2w))]
13988#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13989pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13990    unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) }
13991}
13992
13993/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13994/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13995/// mask bit is not set).
13996///
13997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
13998#[inline]
13999#[target_feature(enable = "avx512fp16,avx512vl")]
14000#[cfg_attr(test, assert_instr(vcvttph2w))]
14001#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14002pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
14003    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
14004}
14005
14006/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14007/// truncation, and store the results in dst.
14008///
14009/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
14010#[inline]
14011#[target_feature(enable = "avx512fp16,avx512vl")]
14012#[cfg_attr(test, assert_instr(vcvttph2w))]
14013#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14014pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
14015    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
14016}
14017
14018/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14019/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14020/// mask bit is not set).
14021///
14022/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
14023#[inline]
14024#[target_feature(enable = "avx512fp16,avx512vl")]
14025#[cfg_attr(test, assert_instr(vcvttph2w))]
14026#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14027pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
14028    unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) }
14029}
14030
14031/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14032/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14033/// mask bit is not set).
14034///
14035/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
14036#[inline]
14037#[target_feature(enable = "avx512fp16,avx512vl")]
14038#[cfg_attr(test, assert_instr(vcvttph2w))]
14039#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14040pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
14041    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
14042}
14043
14044/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14045/// truncation, and store the results in dst.
14046///
14047/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
14048#[inline]
14049#[target_feature(enable = "avx512fp16")]
14050#[cfg_attr(test, assert_instr(vcvttph2w))]
14051#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14052pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
14053    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
14054}
14055
14056/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14057/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14058/// mask bit is not set).
14059///
14060/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
14061#[inline]
14062#[target_feature(enable = "avx512fp16")]
14063#[cfg_attr(test, assert_instr(vcvttph2w))]
14064#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14065pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
14066    unsafe {
14067        transmute(vcvttph2w_512(
14068            a,
14069            src.as_i16x32(),
14070            k,
14071            _MM_FROUND_CUR_DIRECTION,
14072        ))
14073    }
14074}
14075
14076/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14077/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14078/// mask bit is not set).
14079///
14080/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
14081#[inline]
14082#[target_feature(enable = "avx512fp16")]
14083#[cfg_attr(test, assert_instr(vcvttph2w))]
14084#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14085pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
14086    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
14087}
14088
14089/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14090/// truncation, and store the results in dst.
14091///
14092/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14093///
14094/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
14095#[inline]
14096#[target_feature(enable = "avx512fp16")]
14097#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
14098#[rustc_legacy_const_generics(1)]
14099#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14100pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
14101    static_assert_sae!(SAE);
14102    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14103}
14104
14105/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14106/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14107/// mask bit is not set).
14108///
14109/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14110///
14111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
14112#[inline]
14113#[target_feature(enable = "avx512fp16")]
14114#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
14115#[rustc_legacy_const_generics(3)]
14116#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14117pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
14118    src: __m512i,
14119    k: __mmask32,
14120    a: __m512h,
14121) -> __m512i {
14122    unsafe {
14123        static_assert_sae!(SAE);
14124        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
14125    }
14126}
14127
14128/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14129/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14130/// mask bit is not set).
14131///
14132/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14133///
14134/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
14135#[inline]
14136#[target_feature(enable = "avx512fp16")]
14137#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
14138#[rustc_legacy_const_generics(2)]
14139#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14140pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14141    static_assert_sae!(SAE);
14142    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
14143}
14144
14145/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14146/// truncation, and store the results in dst.
14147///
14148/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
14149#[inline]
14150#[target_feature(enable = "avx512fp16,avx512vl")]
14151#[cfg_attr(test, assert_instr(vcvttph2uw))]
14152#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14153pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
14154    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
14155}
14156
14157/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14158/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14159/// mask bit is not set).
14160///
14161/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
14162#[inline]
14163#[target_feature(enable = "avx512fp16,avx512vl")]
14164#[cfg_attr(test, assert_instr(vcvttph2uw))]
14165#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14166pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14167    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
14168}
14169
14170/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14171/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14172/// mask bit is not set).
14173///
14174/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
14175#[inline]
14176#[target_feature(enable = "avx512fp16,avx512vl")]
14177#[cfg_attr(test, assert_instr(vcvttph2uw))]
14178#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14179pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
14180    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
14181}
14182
14183/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14184/// truncation, and store the results in dst.
14185///
14186/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
14187#[inline]
14188#[target_feature(enable = "avx512fp16,avx512vl")]
14189#[cfg_attr(test, assert_instr(vcvttph2uw))]
14190#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14191pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
14192    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
14193}
14194
14195/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14196/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14197/// mask bit is not set).
14198///
14199/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
14200#[inline]
14201#[target_feature(enable = "avx512fp16,avx512vl")]
14202#[cfg_attr(test, assert_instr(vcvttph2uw))]
14203#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14204pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
14205    unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) }
14206}
14207
14208/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14209/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14210/// mask bit is not set).
14211///
14212/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
14213#[inline]
14214#[target_feature(enable = "avx512fp16,avx512vl")]
14215#[cfg_attr(test, assert_instr(vcvttph2uw))]
14216#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14217pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
14218    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
14219}
14220
14221/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14222/// truncation, and store the results in dst.
14223///
14224/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
14225#[inline]
14226#[target_feature(enable = "avx512fp16")]
14227#[cfg_attr(test, assert_instr(vcvttph2uw))]
14228#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14229pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
14230    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
14231}
14232
14233/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14234/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14235/// mask bit is not set).
14236///
14237/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
14238#[inline]
14239#[target_feature(enable = "avx512fp16")]
14240#[cfg_attr(test, assert_instr(vcvttph2uw))]
14241#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14242pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
14243    unsafe {
14244        transmute(vcvttph2uw_512(
14245            a,
14246            src.as_u16x32(),
14247            k,
14248            _MM_FROUND_CUR_DIRECTION,
14249        ))
14250    }
14251}
14252
14253/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14254/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14255/// mask bit is not set).
14256///
14257/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
14258#[inline]
14259#[target_feature(enable = "avx512fp16")]
14260#[cfg_attr(test, assert_instr(vcvttph2uw))]
14261#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14262pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14263    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
14264}
14265
14266/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14267/// truncation, and store the results in dst.
14268///
14269/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14270///
14271/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
14272#[inline]
14273#[target_feature(enable = "avx512fp16")]
14274#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14275#[rustc_legacy_const_generics(1)]
14276#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14277pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14278    static_assert_sae!(SAE);
14279    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14280}
14281
14282/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14283/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14284/// mask bit is not set).
14285///
14286/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14287///
14288/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
14289#[inline]
14290#[target_feature(enable = "avx512fp16")]
14291#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14292#[rustc_legacy_const_generics(3)]
14293#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14294pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
14295    src: __m512i,
14296    k: __mmask32,
14297    a: __m512h,
14298) -> __m512i {
14299    unsafe {
14300        static_assert_sae!(SAE);
14301        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
14302    }
14303}
14304
14305/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14306/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14307/// mask bit is not set).
14308///
14309/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14310///
14311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
14312#[inline]
14313#[target_feature(enable = "avx512fp16")]
14314#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14315#[rustc_legacy_const_generics(2)]
14316#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14317pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14318    static_assert_sae!(SAE);
14319    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14320}
14321
14322/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14323/// results in dst.
14324///
14325/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
14326#[inline]
14327#[target_feature(enable = "avx512fp16,avx512vl")]
14328#[cfg_attr(test, assert_instr(vcvtph2dq))]
14329#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14330pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
14331    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
14332}
14333
14334/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14335/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14336///
14337/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
14338#[inline]
14339#[target_feature(enable = "avx512fp16,avx512vl")]
14340#[cfg_attr(test, assert_instr(vcvtph2dq))]
14341#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14342pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14343    unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) }
14344}
14345
14346/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14347/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14348///
14349/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
14350#[inline]
14351#[target_feature(enable = "avx512fp16,avx512vl")]
14352#[cfg_attr(test, assert_instr(vcvtph2dq))]
14353#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14354pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14355    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
14356}
14357
14358/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14359/// results in dst.
14360///
14361/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
14362#[inline]
14363#[target_feature(enable = "avx512fp16,avx512vl")]
14364#[cfg_attr(test, assert_instr(vcvtph2dq))]
14365#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14366pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
14367    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
14368}
14369
14370/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14371/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14372///
14373/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
14374#[inline]
14375#[target_feature(enable = "avx512fp16,avx512vl")]
14376#[cfg_attr(test, assert_instr(vcvtph2dq))]
14377#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14378pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14379    unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) }
14380}
14381
14382/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14383/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14384///
14385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
14386#[inline]
14387#[target_feature(enable = "avx512fp16,avx512vl")]
14388#[cfg_attr(test, assert_instr(vcvtph2dq))]
14389#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14390pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14391    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
14392}
14393
14394/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14395/// results in dst.
14396///
14397/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
14398#[inline]
14399#[target_feature(enable = "avx512fp16")]
14400#[cfg_attr(test, assert_instr(vcvtph2dq))]
14401#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14402pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
14403    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14404}
14405
14406/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14407/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14408///
14409/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
14410#[inline]
14411#[target_feature(enable = "avx512fp16")]
14412#[cfg_attr(test, assert_instr(vcvtph2dq))]
14413#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14414pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14415    unsafe {
14416        transmute(vcvtph2dq_512(
14417            a,
14418            src.as_i32x16(),
14419            k,
14420            _MM_FROUND_CUR_DIRECTION,
14421        ))
14422    }
14423}
14424
14425/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14426/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14427///
14428/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
14429#[inline]
14430#[target_feature(enable = "avx512fp16")]
14431#[cfg_attr(test, assert_instr(vcvtph2dq))]
14432#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14433pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14434    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
14435}
14436
14437/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14438/// results in dst.
14439///
14440/// Rounding is done according to the rounding parameter, which can be one of:
14441///
14442/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14443/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14444/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14445/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14446/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14447///
14448/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
14449#[inline]
14450#[target_feature(enable = "avx512fp16")]
14451#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14452#[rustc_legacy_const_generics(1)]
14453#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14454pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14455    static_assert_rounding!(ROUNDING);
14456    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14457}
14458
14459/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14460/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14461///
14462/// Rounding is done according to the rounding parameter, which can be one of:
14463///
14464/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14465/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14466/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14467/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14468/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14469///
14470/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
14471#[inline]
14472#[target_feature(enable = "avx512fp16")]
14473#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14474#[rustc_legacy_const_generics(3)]
14475#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14476pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
14477    src: __m512i,
14478    k: __mmask16,
14479    a: __m256h,
14480) -> __m512i {
14481    unsafe {
14482        static_assert_rounding!(ROUNDING);
14483        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
14484    }
14485}
14486
14487/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14488/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14489///
14490/// Rounding is done according to the rounding parameter, which can be one of:
14491///
14492/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14493/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14494/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14495/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14496/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14497///
14498/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
14499#[inline]
14500#[target_feature(enable = "avx512fp16")]
14501#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14502#[rustc_legacy_const_generics(2)]
14503#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14504pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14505    static_assert_rounding!(ROUNDING);
14506    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14507}
14508
14509/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14510/// the result in dst.
14511///
14512/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
14513#[inline]
14514#[target_feature(enable = "avx512fp16")]
14515#[cfg_attr(test, assert_instr(vcvtsh2si))]
14516#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14517pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
14518    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14519}
14520
14521/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14522/// the result in dst.
14523///
14524/// Rounding is done according to the rounding parameter, which can be one of:
14525///
14526/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14527/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14528/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14529/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14530/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14531///
14532/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
14533#[inline]
14534#[target_feature(enable = "avx512fp16")]
14535#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
14536#[rustc_legacy_const_generics(1)]
14537#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14538pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
14539    unsafe {
14540        static_assert_rounding!(ROUNDING);
14541        vcvtsh2si32(a, ROUNDING)
14542    }
14543}
14544
14545/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14546/// results in dst.
14547///
14548/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
14549#[inline]
14550#[target_feature(enable = "avx512fp16,avx512vl")]
14551#[cfg_attr(test, assert_instr(vcvtph2udq))]
14552#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14553pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
14554    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
14555}
14556
14557/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14558/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14559///
14560/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
14561#[inline]
14562#[target_feature(enable = "avx512fp16,avx512vl")]
14563#[cfg_attr(test, assert_instr(vcvtph2udq))]
14564#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14565pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14566    unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) }
14567}
14568
14569/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14570/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14571///
14572/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
14573#[inline]
14574#[target_feature(enable = "avx512fp16,avx512vl")]
14575#[cfg_attr(test, assert_instr(vcvtph2udq))]
14576#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14577pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14578    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
14579}
14580
14581/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14582/// the results in dst.
14583///
14584/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
14585#[inline]
14586#[target_feature(enable = "avx512fp16,avx512vl")]
14587#[cfg_attr(test, assert_instr(vcvtph2udq))]
14588#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14589pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
14590    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
14591}
14592
14593/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14594/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14595///
14596/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
14597#[inline]
14598#[target_feature(enable = "avx512fp16,avx512vl")]
14599#[cfg_attr(test, assert_instr(vcvtph2udq))]
14600#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14601pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14602    unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) }
14603}
14604
14605/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14606/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14607///
14608/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
14609#[inline]
14610#[target_feature(enable = "avx512fp16,avx512vl")]
14611#[cfg_attr(test, assert_instr(vcvtph2udq))]
14612#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14613pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14614    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
14615}
14616
14617/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14618/// the results in dst.
14619///
14620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
14621#[inline]
14622#[target_feature(enable = "avx512fp16")]
14623#[cfg_attr(test, assert_instr(vcvtph2udq))]
14624#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14625pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
14626    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14627}
14628
14629/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14630/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14631///
14632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
14633#[inline]
14634#[target_feature(enable = "avx512fp16")]
14635#[cfg_attr(test, assert_instr(vcvtph2udq))]
14636#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14637pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14638    unsafe {
14639        transmute(vcvtph2udq_512(
14640            a,
14641            src.as_u32x16(),
14642            k,
14643            _MM_FROUND_CUR_DIRECTION,
14644        ))
14645    }
14646}
14647
14648/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14649/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14650///
14651/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
14652#[inline]
14653#[target_feature(enable = "avx512fp16")]
14654#[cfg_attr(test, assert_instr(vcvtph2udq))]
14655#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14656pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14657    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
14658}
14659
14660/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14661/// the results in dst.
14662///
14663/// Rounding is done according to the rounding parameter, which can be one of:
14664///
14665/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14666/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14667/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14668/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14669/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14670///
14671/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
14672#[inline]
14673#[target_feature(enable = "avx512fp16")]
14674#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14675#[rustc_legacy_const_generics(1)]
14676#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14677pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14678    static_assert_rounding!(ROUNDING);
14679    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14680}
14681
14682/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14683/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14684///
14685/// Rounding is done according to the rounding parameter, which can be one of:
14686///
14687/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14688/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14689/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14690/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14691/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14692///
14693/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
14694#[inline]
14695#[target_feature(enable = "avx512fp16")]
14696#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14697#[rustc_legacy_const_generics(3)]
14698#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14699pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
14700    src: __m512i,
14701    k: __mmask16,
14702    a: __m256h,
14703) -> __m512i {
14704    unsafe {
14705        static_assert_rounding!(ROUNDING);
14706        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
14707    }
14708}
14709
14710/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14711/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14712///
14713/// Rounding is done according to the rounding parameter, which can be one of:
14714///
14715/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14716/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14717/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14718/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14719/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14720///
14721/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
14722#[inline]
14723#[target_feature(enable = "avx512fp16")]
14724#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14725#[rustc_legacy_const_generics(2)]
14726#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14727pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14728    static_assert_rounding!(ROUNDING);
14729    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14730}
14731
14732/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14733/// the result in dst.
14734///
14735/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
14736#[inline]
14737#[target_feature(enable = "avx512fp16")]
14738#[cfg_attr(test, assert_instr(vcvtsh2usi))]
14739#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14740pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
14741    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14742}
14743
14744/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14745/// the result in dst.
14746///
14747/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14748///
14749/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
14750#[inline]
14751#[target_feature(enable = "avx512fp16")]
14752#[cfg_attr(test, assert_instr(vcvtsh2usi, SAE = 8))]
14753#[rustc_legacy_const_generics(1)]
14754#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14755pub fn _mm_cvt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
14756    unsafe {
14757        static_assert_rounding!(SAE);
14758        vcvtsh2usi32(a, SAE)
14759    }
14760}
14761
14762/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14763/// store the results in dst.
14764///
14765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
14766#[inline]
14767#[target_feature(enable = "avx512fp16,avx512vl")]
14768#[cfg_attr(test, assert_instr(vcvttph2dq))]
14769#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14770pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
14771    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
14772}
14773
14774/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14775/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14776///
14777/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
14778#[inline]
14779#[target_feature(enable = "avx512fp16,avx512vl")]
14780#[cfg_attr(test, assert_instr(vcvttph2dq))]
14781#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14782pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14783    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
14784}
14785
14786/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14787/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14788///
14789/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
14790#[inline]
14791#[target_feature(enable = "avx512fp16,avx512vl")]
14792#[cfg_attr(test, assert_instr(vcvttph2dq))]
14793#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14794pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14795    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
14796}
14797
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
    // Unmasked form: all 8 mask bits set, so the undefined `src` is never merged in.
    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
}
14809
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: `avx512fp16,avx512vl` is enabled on this function, so the
    // 256-bit `vcvttph2dq` intrinsic is available.
    unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) }
}
14821
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
}
14833
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
    // Unmasked form: all 16 mask bits set, so the undefined `src` is never merged in.
    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
}
14845
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvttph2dq` is
    // available. The 512-bit intrinsic takes an extra SAE operand; the
    // non-`round` form uses the current MXCSR direction.
    unsafe {
        transmute(vcvttph2dq_512(
            a,
            src.as_i32x16(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14864
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
}
14876
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
    // Reject SAE values other than _MM_FROUND_CUR_DIRECTION / _MM_FROUND_NO_EXC
    // at compile time, then delegate with an all-ones mask.
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
}
14892
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvttph2dq` is
    // available; SAE is validated at compile time before the call.
    unsafe {
        static_assert_sae!(SAE);
        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
    }
}
14914
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
    // Zero-masking is merge-masking with an all-zero source vector.
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
}
14930
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2si))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvttsh2si` is
    // available; the non-`round` form uses the current MXCSR direction.
    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
}
14942
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
/// the result in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvttsh2si` is
    // available; SAE is validated at compile time before the call.
    unsafe {
        static_assert_sae!(SAE);
        vcvttsh2si32(a, SAE)
    }
}
14960
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
    // Unmasked form: all relevant mask bits set (only the low 4 matter for
    // 4 result lanes), so the undefined `src` is never merged in.
    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
}
14972
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: `avx512fp16,avx512vl` is enabled on this function, so the
    // `vcvttph2udq` intrinsic is available; it performs the merge with `src`.
    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
}
14984
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
}
14996
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
    // Unmasked form: all 8 mask bits set, so the undefined `src` is never merged in.
    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
}
15008
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: `avx512fp16,avx512vl` is enabled on this function, so the
    // 256-bit `vcvttph2udq` intrinsic is available.
    unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) }
}
15020
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
}
15032
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
    // Unmasked form: all 16 mask bits set, so the undefined `src` is never merged in.
    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
}
15044
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvttph2udq` is
    // available. The 512-bit intrinsic takes an extra SAE operand; the
    // non-`round` form uses the current MXCSR direction.
    unsafe {
        transmute(vcvttph2udq_512(
            a,
            src.as_u32x16(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15063
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
}
15075
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
    // Validate SAE at compile time, then delegate with an all-ones mask.
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
}
15091
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvttph2udq` is
    // available; SAE is validated at compile time before the call.
    unsafe {
        static_assert_sae!(SAE);
        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
    }
}
15113
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
    // Zero-masking is merge-masking with an all-zero source vector.
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
}
15129
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2usi))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvttsh2usi` is
    // available; the non-`round` form uses the current MXCSR direction.
    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
}
15141
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
/// the result in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvttsh2usi` is
    // available; SAE is validated at compile time before the call.
    unsafe {
        static_assert_sae!(SAE);
        vcvttsh2usi32(a, SAE)
    }
}
15159
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
    // Unmasked form: only the low 2 mask bits matter for 2 result lanes;
    // 0xff covers them all, so the undefined `src` is never merged in.
    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
}
15171
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: `avx512fp16,avx512vl` is enabled on this function, so the
    // `vcvtph2qq` intrinsic is available; it performs the merge with `src`.
    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
}
15183
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
}
15195
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
    // Unmasked form: only the low 4 mask bits matter for 4 result lanes;
    // 0xff covers them all, so the undefined `src` is never merged in.
    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
}
15207
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: `avx512fp16,avx512vl` is enabled on this function, so the
    // 256-bit `vcvtph2qq` intrinsic is available.
    unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) }
}
15219
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
}
15231
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
    // Unmasked form: all 8 mask bits set (8 x 64-bit result lanes), so the
    // undefined `src` is never merged in.
    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
}
15243
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvtph2qq` is
    // available. The 512-bit intrinsic takes an extra rounding operand; the
    // non-`round` form uses the current MXCSR direction.
    unsafe {
        transmute(vcvtph2qq_512(
            a,
            src.as_i64x8(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15262
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
}
15274
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
    // Validate the rounding constant at compile time, then delegate with an
    // all-ones mask so the undefined `src` is never merged in.
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
}
15296
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    // SAFETY: `avx512fp16` is enabled on this function, so `vcvtph2qq` is
    // available; the rounding constant is validated at compile time.
    unsafe {
        static_assert_rounding!(ROUNDING);
        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
    }
}
15324
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
    // Zero-masking is merge-masking with an all-zero source vector.
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
}
15346
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
    // Unmasked form: only the low 2 mask bits matter for 2 result lanes;
    // 0xff covers them all, so the undefined `src` is never merged in.
    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
}
15358
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: `avx512fp16,avx512vl` is enabled on this function, so the
    // `vcvtph2uqq` intrinsic is available; it performs the merge with `src`.
    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
}
15370
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
    // Zero-masking is merge-masking with an all-zero source vector.
    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
}
15382
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
    // Unmasked form: only the low 4 mask bits matter for 4 result lanes;
    // 0xff covers them all, so the undefined `src` is never merged in.
    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
}
15394
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: `avx512fp16,avx512vl` is enabled on this function, so the
    // 256-bit `vcvtph2uqq` intrinsic is available.
    unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) }
}
15406
15407/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15408/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15409///
15410/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
15411#[inline]
15412#[target_feature(enable = "avx512fp16,avx512vl")]
15413#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15414#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15415pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15416    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
15417}
15418
15419/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15420/// store the results in dst.
15421///
15422/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
15423#[inline]
15424#[target_feature(enable = "avx512fp16")]
15425#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15426#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15427pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
15428    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
15429}
15430
15431/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15432/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15433///
15434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
15435#[inline]
15436#[target_feature(enable = "avx512fp16")]
15437#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15438#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15439pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15440    unsafe {
15441        transmute(vcvtph2uqq_512(
15442            a,
15443            src.as_u64x8(),
15444            k,
15445            _MM_FROUND_CUR_DIRECTION,
15446        ))
15447    }
15448}
15449
15450/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15451/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15452///
15453/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
15454#[inline]
15455#[target_feature(enable = "avx512fp16")]
15456#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15457#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15458pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15459    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
15460}
15461
15462/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15463/// store the results in dst.
15464///
15465/// Rounding is done according to the rounding parameter, which can be one of:
15466///
15467/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15468/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15469/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15470/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15471/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15472///
15473/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
15474#[inline]
15475#[target_feature(enable = "avx512fp16")]
15476#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15477#[rustc_legacy_const_generics(1)]
15478#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15479pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15480    static_assert_rounding!(ROUNDING);
15481    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15482}
15483
15484/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15485/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15486///
15487/// Rounding is done according to the rounding parameter, which can be one of:
15488///
15489/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15490/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15491/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15492/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15493/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15494///
15495/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
15496#[inline]
15497#[target_feature(enable = "avx512fp16")]
15498#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15499#[rustc_legacy_const_generics(3)]
15500#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15501pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
15502    src: __m512i,
15503    k: __mmask8,
15504    a: __m128h,
15505) -> __m512i {
15506    unsafe {
15507        static_assert_rounding!(ROUNDING);
15508        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
15509    }
15510}
15511
15512/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15513/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15514///
15515/// Rounding is done according to the rounding parameter, which can be one of:
15516///
15517/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15518/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15519/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15520/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15521/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15522///
15523/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
15524#[inline]
15525#[target_feature(enable = "avx512fp16")]
15526#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15527#[rustc_legacy_const_generics(2)]
15528#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15529pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15530    static_assert_rounding!(ROUNDING);
15531    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15532}
15533
15534/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15535/// store the results in dst.
15536///
15537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
15538#[inline]
15539#[target_feature(enable = "avx512fp16,avx512vl")]
15540#[cfg_attr(test, assert_instr(vcvttph2qq))]
15541#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15542pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
15543    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
15544}
15545
15546/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15547/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15548///
15549/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
15550#[inline]
15551#[target_feature(enable = "avx512fp16,avx512vl")]
15552#[cfg_attr(test, assert_instr(vcvttph2qq))]
15553#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15554pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15555    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
15556}
15557
15558/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15559/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15560///
15561/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
15562#[inline]
15563#[target_feature(enable = "avx512fp16,avx512vl")]
15564#[cfg_attr(test, assert_instr(vcvttph2qq))]
15565#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15566pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15567    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
15568}
15569
15570/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15571/// store the results in dst.
15572///
15573/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
15574#[inline]
15575#[target_feature(enable = "avx512fp16,avx512vl")]
15576#[cfg_attr(test, assert_instr(vcvttph2qq))]
15577#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15578pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
15579    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
15580}
15581
15582/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15583/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15584///
15585/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
15586#[inline]
15587#[target_feature(enable = "avx512fp16,avx512vl")]
15588#[cfg_attr(test, assert_instr(vcvttph2qq))]
15589#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15590pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15591    unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) }
15592}
15593
15594/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15595/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15596///
15597/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
15598#[inline]
15599#[target_feature(enable = "avx512fp16,avx512vl")]
15600#[cfg_attr(test, assert_instr(vcvttph2qq))]
15601#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15602pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15603    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
15604}
15605
15606/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15607/// store the results in dst.
15608///
15609/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
15610#[inline]
15611#[target_feature(enable = "avx512fp16")]
15612#[cfg_attr(test, assert_instr(vcvttph2qq))]
15613#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15614pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
15615    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
15616}
15617
15618/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15619/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15620///
15621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
15622#[inline]
15623#[target_feature(enable = "avx512fp16")]
15624#[cfg_attr(test, assert_instr(vcvttph2qq))]
15625#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15626pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15627    unsafe {
15628        transmute(vcvttph2qq_512(
15629            a,
15630            src.as_i64x8(),
15631            k,
15632            _MM_FROUND_CUR_DIRECTION,
15633        ))
15634    }
15635}
15636
15637/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15638/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15639///
15640/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
15641#[inline]
15642#[target_feature(enable = "avx512fp16")]
15643#[cfg_attr(test, assert_instr(vcvttph2qq))]
15644#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15645pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15646    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
15647}
15648
15649/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15650/// store the results in dst.
15651///
15652/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15653///
15654/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
15655#[inline]
15656#[target_feature(enable = "avx512fp16")]
15657#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15658#[rustc_legacy_const_generics(1)]
15659#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15660pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
15661    static_assert_sae!(SAE);
15662    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15663}
15664
15665/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15666/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15667///
15668/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15669///
15670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
15671#[inline]
15672#[target_feature(enable = "avx512fp16")]
15673#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15674#[rustc_legacy_const_generics(3)]
15675#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15676pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
15677    src: __m512i,
15678    k: __mmask8,
15679    a: __m128h,
15680) -> __m512i {
15681    unsafe {
15682        static_assert_sae!(SAE);
15683        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
15684    }
15685}
15686
15687/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15688/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15689///
15690/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15691///
15692/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
15693#[inline]
15694#[target_feature(enable = "avx512fp16")]
15695#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15696#[rustc_legacy_const_generics(2)]
15697#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15698pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15699    static_assert_sae!(SAE);
15700    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
15701}
15702
15703/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15704/// store the results in dst.
15705///
15706/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
15707#[inline]
15708#[target_feature(enable = "avx512fp16,avx512vl")]
15709#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15710#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15711pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
15712    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
15713}
15714
15715/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15716/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15717///
15718/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
15719#[inline]
15720#[target_feature(enable = "avx512fp16,avx512vl")]
15721#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15722#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15723pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15724    unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) }
15725}
15726
15727/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15728/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15729///
15730/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
15731#[inline]
15732#[target_feature(enable = "avx512fp16,avx512vl")]
15733#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15734#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15735pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15736    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
15737}
15738
15739/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15740/// store the results in dst.
15741///
15742/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
15743#[inline]
15744#[target_feature(enable = "avx512fp16,avx512vl")]
15745#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15746#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15747pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
15748    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
15749}
15750
15751/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15752/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15753///
15754/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
15755#[inline]
15756#[target_feature(enable = "avx512fp16,avx512vl")]
15757#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15758#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15759pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15760    unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) }
15761}
15762
15763/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15764/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15765///
15766/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
15767#[inline]
15768#[target_feature(enable = "avx512fp16,avx512vl")]
15769#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15770#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15771pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15772    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
15773}
15774
15775/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15776/// store the results in dst.
15777///
15778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
15779#[inline]
15780#[target_feature(enable = "avx512fp16")]
15781#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15782#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15783pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
15784    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
15785}
15786
15787/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15788/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15789///
15790/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
15791#[inline]
15792#[target_feature(enable = "avx512fp16")]
15793#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15794#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15795pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15796    unsafe {
15797        transmute(vcvttph2uqq_512(
15798            a,
15799            src.as_u64x8(),
15800            k,
15801            _MM_FROUND_CUR_DIRECTION,
15802        ))
15803    }
15804}
15805
15806/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15807/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15808///
15809/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
15810#[inline]
15811#[target_feature(enable = "avx512fp16")]
15812#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15813#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15814pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15815    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
15816}
15817
15818/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15819/// store the results in dst.
15820///
15821/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15822///
15823/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
15824#[inline]
15825#[target_feature(enable = "avx512fp16")]
15826#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15827#[rustc_legacy_const_generics(1)]
15828#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15829pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
15830    static_assert_sae!(SAE);
15831    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15832}
15833
15834/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15835/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15836///
15837/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15838///
15839/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
15840#[inline]
15841#[target_feature(enable = "avx512fp16")]
15842#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15843#[rustc_legacy_const_generics(3)]
15844#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15845pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
15846    src: __m512i,
15847    k: __mmask8,
15848    a: __m128h,
15849) -> __m512i {
15850    unsafe {
15851        static_assert_sae!(SAE);
15852        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
15853    }
15854}
15855
15856/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15857/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15858///
15859/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15860///
15861/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
15862#[inline]
15863#[target_feature(enable = "avx512fp16")]
15864#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15865#[rustc_legacy_const_generics(2)]
15866#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15867pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15868    static_assert_sae!(SAE);
15869    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
15870}
15871
15872/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15873/// floating-point elements, and store the results in dst.
15874///
15875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
15876#[inline]
15877#[target_feature(enable = "avx512fp16,avx512vl")]
15878#[cfg_attr(test, assert_instr(vcvtph2psx))]
15879#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15880pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
15881    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
15882}
15883
15884/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15885/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15886/// dst when the corresponding mask bit is not set).
15887///
15888/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
15889#[inline]
15890#[target_feature(enable = "avx512fp16,avx512vl")]
15891#[cfg_attr(test, assert_instr(vcvtph2psx))]
15892#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15893pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
15894    unsafe { vcvtph2psx_128(a, src, k) }
15895}
15896
15897/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15898/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15899/// corresponding mask bit is not set).
15900///
15901/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
15902#[inline]
15903#[target_feature(enable = "avx512fp16,avx512vl")]
15904#[cfg_attr(test, assert_instr(vcvtph2psx))]
15905#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15906pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
15907    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
15908}
15909
15910/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15911/// floating-point elements, and store the results in dst.
15912///
15913/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
15914#[inline]
15915#[target_feature(enable = "avx512fp16,avx512vl")]
15916#[cfg_attr(test, assert_instr(vcvtph2psx))]
15917#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15918pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
15919    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
15920}
15921
15922/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15923/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15924/// dst when the corresponding mask bit is not set).
15925///
15926/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
15927#[inline]
15928#[target_feature(enable = "avx512fp16,avx512vl")]
15929#[cfg_attr(test, assert_instr(vcvtph2psx))]
15930#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15931pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
15932    unsafe { vcvtph2psx_256(a, src, k) }
15933}
15934
15935/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15936/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15937/// corresponding mask bit is not set).
15938///
15939/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
15940#[inline]
15941#[target_feature(enable = "avx512fp16,avx512vl")]
15942#[cfg_attr(test, assert_instr(vcvtph2psx))]
15943#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15944pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
15945    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
15946}
15947
15948/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15949/// floating-point elements, and store the results in dst.
15950///
15951/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
15952#[inline]
15953#[target_feature(enable = "avx512fp16")]
15954#[cfg_attr(test, assert_instr(vcvtph2psx))]
15955#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15956pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
15957    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
15958}
15959
15960/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15961/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15962/// dst when the corresponding mask bit is not set).
15963///
15964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
15965#[inline]
15966#[target_feature(enable = "avx512fp16")]
15967#[cfg_attr(test, assert_instr(vcvtph2psx))]
15968#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15969pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
15970    unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
15971}
15972
15973/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15974/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15975/// corresponding mask bit is not set).
15976///
15977/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
15978#[inline]
15979#[target_feature(enable = "avx512fp16")]
15980#[cfg_attr(test, assert_instr(vcvtph2psx))]
15981#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15982pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
15983    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
15984}
15985
15986/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15987/// floating-point elements, and store the results in dst.
15988///
15989/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15990///
15991/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
15992#[inline]
15993#[target_feature(enable = "avx512fp16")]
15994#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15995#[rustc_legacy_const_generics(1)]
15996#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15997pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
15998    static_assert_sae!(SAE);
15999    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
16000}
16001
16002/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
16003/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16004/// dst when the corresponding mask bit is not set).
16005///
16006/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16007///
16008/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
16009#[inline]
16010#[target_feature(enable = "avx512fp16")]
16011#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
16012#[rustc_legacy_const_generics(3)]
16013#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16014pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
16015    src: __m512,
16016    k: __mmask16,
16017    a: __m256h,
16018) -> __m512 {
16019    unsafe {
16020        static_assert_sae!(SAE);
16021        vcvtph2psx_512(a, src, k, SAE)
16022    }
16023}
16024
16025/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
16026/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16027/// corresponding mask bit is not set).
16028///
16029/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16030///
16031/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
16032#[inline]
16033#[target_feature(enable = "avx512fp16")]
16034#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
16035#[rustc_legacy_const_generics(2)]
16036#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16037pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
16038    static_assert_sae!(SAE);
16039    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
16040}
16041
16042/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
16043/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
16044/// elements from a to the upper elements of dst.
16045///
16046/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
16047#[inline]
16048#[target_feature(enable = "avx512fp16")]
16049#[cfg_attr(test, assert_instr(vcvtsh2ss))]
16050#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16051pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
16052    _mm_mask_cvtsh_ss(a, 0xff, a, b)
16053}
16054
16055/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
16056/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16057/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
16058/// upper elements of dst.
16059///
16060/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
16061#[inline]
16062#[target_feature(enable = "avx512fp16")]
16063#[cfg_attr(test, assert_instr(vcvtsh2ss))]
16064#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16065pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
16066    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
16067}
16068
16069/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
16070/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16071/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
16072/// of dst.
16073///
16074/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
16075#[inline]
16076#[target_feature(enable = "avx512fp16")]
16077#[cfg_attr(test, assert_instr(vcvtsh2ss))]
16078#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16079pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
16080    _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b)
16081}
16082
16083/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
16084/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
16085/// from a to the upper elements of dst.
16086///
16087/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16088///
16089/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
16090#[inline]
16091#[target_feature(enable = "avx512fp16")]
16092#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
16093#[rustc_legacy_const_generics(2)]
16094#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16095pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
16096    static_assert_sae!(SAE);
16097    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
16098}
16099
16100/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
16101/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16102/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
16103/// upper elements of dst.
16104///
16105/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16106///
16107/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
16108#[inline]
16109#[target_feature(enable = "avx512fp16")]
16110#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
16111#[rustc_legacy_const_generics(4)]
16112#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16113pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
16114    src: __m128,
16115    k: __mmask8,
16116    a: __m128,
16117    b: __m128h,
16118) -> __m128 {
16119    unsafe {
16120        static_assert_sae!(SAE);
16121        vcvtsh2ss(a, b, src, k, SAE)
16122    }
16123}
16124
16125/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
16126/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16127/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
16128/// of dst.
16129///
16130/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16131///
16132/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
16133#[inline]
16134#[target_feature(enable = "avx512fp16")]
16135#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
16136#[rustc_legacy_const_generics(3)]
16137#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16138pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
16139    static_assert_sae!(SAE);
16140    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_set_ss(0.0), k, a, b)
16141}
16142
16143/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16144/// floating-point elements, and store the results in dst.
16145///
16146/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
16147#[inline]
16148#[target_feature(enable = "avx512fp16,avx512vl")]
16149#[cfg_attr(test, assert_instr(vcvtph2pd))]
16150#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16151pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
16152    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
16153}
16154
16155/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16156/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16157/// dst when the corresponding mask bit is not set).
16158///
16159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
    // The 128-bit form takes no rounding/SAE operand, so it calls the
    // LLVM intrinsic directly; unselected lanes are taken from `src` per `k`.
    unsafe { vcvtph2pd_128(a, src, k) }
}
16167
16168/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16169/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16170/// corresponding mask bit is not set).
16171///
16172/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
16173#[inline]
16174#[target_feature(enable = "avx512fp16,avx512vl")]
16175#[cfg_attr(test, assert_instr(vcvtph2pd))]
16176#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16177pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
16178    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
16179}
16180
16181/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16182/// floating-point elements, and store the results in dst.
16183///
16184/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
16185#[inline]
16186#[target_feature(enable = "avx512fp16,avx512vl")]
16187#[cfg_attr(test, assert_instr(vcvtph2pd))]
16188#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16189pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
16190    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
16191}
16192
16193/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16194/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16195/// dst when the corresponding mask bit is not set).
16196///
16197/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
    // The 256-bit form takes no rounding/SAE operand, so it calls the
    // LLVM intrinsic directly; unselected lanes are taken from `src` per `k`.
    unsafe { vcvtph2pd_256(a, src, k) }
}
16205
16206/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16207/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16208/// corresponding mask bit is not set).
16209///
16210/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
16211#[inline]
16212#[target_feature(enable = "avx512fp16,avx512vl")]
16213#[cfg_attr(test, assert_instr(vcvtph2pd))]
16214#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16215pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
16216    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
16217}
16218
16219/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16220/// floating-point elements, and store the results in dst.
16221///
16222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
16223#[inline]
16224#[target_feature(enable = "avx512fp16")]
16225#[cfg_attr(test, assert_instr(vcvtph2pd))]
16226#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16227pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
16228    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
16229}
16230
16231/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16232/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16233/// dst when the corresponding mask bit is not set).
16234///
16235/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
16236#[inline]
16237#[target_feature(enable = "avx512fp16")]
16238#[cfg_attr(test, assert_instr(vcvtph2pd))]
16239#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16240pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
16241    unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
16242}
16243
16244/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16245/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16246/// corresponding mask bit is not set).
16247///
16248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
16249#[inline]
16250#[target_feature(enable = "avx512fp16")]
16251#[cfg_attr(test, assert_instr(vcvtph2pd))]
16252#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16253pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
16254    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
16255}
16256
16257/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16258/// floating-point elements, and store the results in dst.
16259///
16260/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16261///
16262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
16263#[inline]
16264#[target_feature(enable = "avx512fp16")]
16265#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16266#[rustc_legacy_const_generics(1)]
16267#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16268pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
16269    static_assert_sae!(SAE);
16270    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
16271}
16272
16273/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16274/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16275/// dst when the corresponding mask bit is not set).
16276///
16277/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16278///
16279/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
16280#[inline]
16281#[target_feature(enable = "avx512fp16")]
16282#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16283#[rustc_legacy_const_generics(3)]
16284#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16285pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
16286    src: __m512d,
16287    k: __mmask8,
16288    a: __m128h,
16289) -> __m512d {
16290    unsafe {
16291        static_assert_sae!(SAE);
16292        vcvtph2pd_512(a, src, k, SAE)
16293    }
16294}
16295
16296/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16297/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16298/// corresponding mask bit is not set).
16299///
16300/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16301///
16302/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
16303#[inline]
16304#[target_feature(enable = "avx512fp16")]
16305#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16306#[rustc_legacy_const_generics(2)]
16307#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16308pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
16309    static_assert_sae!(SAE);
16310    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
16311}
16312
16313/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16314/// floating-point element, store the result in the lower element of dst, and copy the upper element
16315/// from a to the upper element of dst.
16316///
16317/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
16318#[inline]
16319#[target_feature(enable = "avx512fp16")]
16320#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16321#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16322pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
16323    _mm_mask_cvtsh_sd(a, 0xff, a, b)
16324}
16325
16326/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16327/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16328/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16329/// of dst.
16330///
16331/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
16332#[inline]
16333#[target_feature(enable = "avx512fp16")]
16334#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16335#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16336pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16337    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
16338}
16339
16340/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16341/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16342/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16343///
16344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
16345#[inline]
16346#[target_feature(enable = "avx512fp16")]
16347#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16348#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16349pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16350    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
16351}
16352
16353/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16354/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
16355/// to the upper element of dst.
16356///
16357/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16358///
16359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
16360#[inline]
16361#[target_feature(enable = "avx512fp16")]
16362#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16363#[rustc_legacy_const_generics(2)]
16364#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16365pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
16366    static_assert_sae!(SAE);
16367    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
16368}
16369
16370/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16371/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16372/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16373/// of dst.
16374///
16375/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16376///
16377/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
16378#[inline]
16379#[target_feature(enable = "avx512fp16")]
16380#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16381#[rustc_legacy_const_generics(4)]
16382#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16383pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
16384    src: __m128d,
16385    k: __mmask8,
16386    a: __m128d,
16387    b: __m128h,
16388) -> __m128d {
16389    unsafe {
16390        static_assert_sae!(SAE);
16391        vcvtsh2sd(a, b, src, k, SAE)
16392    }
16393}
16394
16395/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16396/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16397/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16398///
16399/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16400///
16401/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
16402#[inline]
16403#[target_feature(enable = "avx512fp16")]
16404#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16405#[rustc_legacy_const_generics(3)]
16406#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
16407pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16408    static_assert_sae!(SAE);
16409    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
16410}
16411
16412/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16413///
16414/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsh_h(a: __m128h) -> f16 {
    // Lane 0 is the lowest half-precision element of the vector.
    unsafe { simd_extract!(a, 0) }
}
16422
16423/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16424///
16425/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsh_h(a: __m256h) -> f16 {
    // Lane 0 is the lowest half-precision element of the vector.
    unsafe { simd_extract!(a, 0) }
}
16433
16434/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16435///
16436/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_cvtsh_h(a: __m512h) -> f16 {
    // Lane 0 is the lowest half-precision element of the vector.
    unsafe { simd_extract!(a, 0) }
}
16444
16445/// Copy the lower 16-bit integer in a to dst.
16446///
16447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
    // Reinterpret the 128-bit vector as eight i16 lanes and take lane 0.
    unsafe { simd_extract!(a.as_i16x8(), 0) }
}
16455
16456/// Copy 16-bit integer a to the lower elements of dst, and zero the upper elements of dst.
16457///
16458/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi16_si128(a: i16) -> __m128i {
    // Insert `a` into lane 0 of an all-zero i16x8; the remaining seven
    // lanes stay zero, giving the required zeroed upper elements.
    unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) }
}
16466
16467#[allow(improper_ctypes)]
16468unsafe extern "unadjusted" {
16469    #[link_name = "llvm.x86.avx512fp16.mask.cmp.ph.128"]
16470    fn vcmpph_128(a: __m128h, b: __m128h, imm5: i32, mask: __mmask8) -> __mmask8;
16471    #[link_name = "llvm.x86.avx512fp16.mask.cmp.ph.256"]
16472    fn vcmpph_256(a: __m256h, b: __m256h, imm5: i32, mask: __mmask16) -> __mmask16;
16473    #[link_name = "llvm.x86.avx512fp16.mask.cmp.ph.512"]
16474    fn vcmpph_512(a: __m512h, b: __m512h, imm5: i32, mask: __mmask32, sae: i32) -> __mmask32;
16475
16476    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
16477    fn vcmpsh(a: __m128h, b: __m128h, imm5: i32, mask: __mmask8, sae: i32) -> __mmask8;
16478    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
16479    fn vcomish(a: __m128h, b: __m128h, imm5: i32, sae: i32) -> i32;
16480
16481    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
16482    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16483    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
16484    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16485    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
16486    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16487    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
16488    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16489
16490    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
16491    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16492    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
16493    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16494    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
16495    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16496    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
16497    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16498
16499    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
16500    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16501    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
16502    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16503    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
16504    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16505    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
16506    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16507
16508    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
16509    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16510    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
16511    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16512    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
16513    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16514    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
16515    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16516
16517    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
16518    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16519    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
16520    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16521    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
16522    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16523    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
16524    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16525    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
16526    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16527    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
16528    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16529    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
16530    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16531    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
16532    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16533
16534    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
16535    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16536    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
16537    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16538    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
16539    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16540    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
16541    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16542    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
16543    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16544    -> __m512;
16545    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
16546    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16547    -> __m512;
16548    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
16549    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16550    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
16551    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16552
16553    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
16554    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16555    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
16556    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
16557
16558    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
16559    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16560
    // Approximate reciprocal, approximate reciprocal square root, and exact
    // square root. Masked (`mask.*`) forms carry a merge source `src` and a
    // write-mask `k`; scalar `sh` forms take a second vector operand `b`.
    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;

    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;

    // Exact sqrt only needs explicit binding for the rounding-controlled forms.
    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16583
    // Packed/scalar max and min. The 512-bit and scalar variants take an
    // `sae` immediate (Intel's "suppress all exceptions" control).
    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;

    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16601
    // Exponent extraction (getexp) and mantissa extraction (getmant).
    // `getmant` additionally takes an `imm8` selecting the extraction mode.
    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;

    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
    fn vgetmantsh(
        a: __m128h,
        b: __m128h,
        imm8: i32,
        src: __m128h,
        k: __mmask8,
        sae: i32,
    ) -> __m128h;
16626
    // Round-to-scaled-integral (rndscale); `imm8` encodes the rounding
    // behavior. NOTE(review): `imm8` precedes `src`/`k` for the packed forms
    // but follows them for the scalar form — keep argument order in sync with
    // the LLVM signatures.
    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
    fn vrndscalesh(
        a: __m128h,
        b: __m128h,
        src: __m128h,
        k: __mmask8,
        imm8: i32,
        sae: i32,
    ) -> __m128h;
16642
    // Scale (scalef) and reduction (reduce) operations; `reduce` takes an
    // `imm8` control, and the 512-bit / scalar forms add a rounding/sae
    // immediate.
    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;

    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
    -> __m128h;
16661
    // FP classification: returns a bitmask (one bit per lane) selected by the
    // `imm8` category flags; the scalar form also takes an input mask `k`.
    #[link_name = "llvm.x86.avx512fp16.fpclass.ph.128"]
    fn vfpclassph_128(a: __m128h, imm8: i32) -> __mmask8;
    #[link_name = "llvm.x86.avx512fp16.fpclass.ph.256"]
    fn vfpclassph_256(a: __m256h, imm8: i32) -> __mmask16;
    #[link_name = "llvm.x86.avx512fp16.fpclass.ph.512"]
    fn vfpclassph_512(a: __m512h, imm8: i32) -> __mmask32;

    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;
16671
    // Integer -> f16 conversions. 16-bit sources (and the wider 32/64-bit
    // forms) bind generic `sitofp.round`/`uitofp.round` intrinsics with a
    // rounding immediate; the 128-bit dq/qq forms instead bind masked
    // `vcvt*2ph` intrinsics with `src`/`k`.
    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i16"]
    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i16"]
    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32i16"]
    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;

    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i32"]
    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i32"]
    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;

    // 64-bit integer sources: results narrow to a __m128h in all widths.
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i64"]
    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;
16714
    // f32 -> f16 and f64 -> f16 narrowing conversions (packed and scalar);
    // the result vector is half (f32) or a quarter (f64) the source width.
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;

    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16732
    // f16 -> 16-bit integer conversions; `tt` (truncating) variants take an
    // `sae` immediate instead of a rounding mode.
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;

    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;

    // f16 -> 32-bit integer conversions: the source vector is half the width
    // of the destination (8 f16 lanes -> 8 i32 lanes, etc.), plus scalar
    // f16 -> i32/u32 forms.
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;

    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;
16792
    // f16 -> 64-bit integer conversions: every source is a __m128h, since at
    // most 8 f16 lanes are consumed even for the 512-bit destination.
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;

    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;

    // f16 -> f32 / f64 widening conversions (packed and scalar).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;

    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;
16836
16837}
16838
16839#[cfg(test)]
16840mod tests {
16841    use crate::core_arch::assert_eq_const as assert_eq;
16842    use crate::core_arch::x86::*;
16843    use crate::ptr::{addr_of, addr_of_mut};
16844    use stdarch_test::simd_test;
16845
    // Test-only helper: fill a 128-bit f16 vector with repeated (re, im)
    // pairs, matching the interleaved "packed complex half" (`pch`) layout.
    #[target_feature(enable = "avx512fp16")]
    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
    const fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
        _mm_setr_ph(re, im, re, im, re, im, re, im)
    }

    // Same as `_mm_set1_pch`, widened to 256 bits (eight complex pairs).
    #[target_feature(enable = "avx512fp16")]
    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
    const fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
        _mm256_setr_ph(
            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
        )
    }

    // Same as `_mm_set1_pch`, widened to 512 bits (sixteen complex pairs).
    #[target_feature(enable = "avx512fp16")]
    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
    const fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
        _mm512_setr_ph(
            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
            re, im, re, im, re, im, re, im, re, im,
        )
    }
16868
16869    #[simd_test(enable = "avx512fp16,avx512vl")]
16870    const fn test_mm_set_ph() {
16871        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16872        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16873        assert_eq_m128h(r, e);
16874    }
16875
    // `set_ph` takes arguments highest lane first; compare with reversed `setr_ph`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_set_ph() {
        let r = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let e = _mm256_setr_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit variant of the same highest-lane-first check.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_set_ph() {
        let r = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let e = _mm512_setr_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
16901
16902    #[simd_test(enable = "avx512fp16,avx512vl")]
16903    const fn test_mm_set_sh() {
16904        let r = _mm_set_sh(1.0);
16905        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
16906        assert_eq_m128h(r, e);
16907    }
16908
16909    #[simd_test(enable = "avx512fp16,avx512vl")]
16910    const fn test_mm_set1_ph() {
16911        let r = _mm_set1_ph(1.0);
16912        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
16913        assert_eq_m128h(r, e);
16914    }
16915
    // Broadcast: all sixteen lanes equal the scalar.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_set1_ph() {
        let r = _mm256_set1_ph(1.0);
        let e = _mm256_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    // Broadcast: all thirty-two lanes equal the scalar.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_set1_ph() {
        let r = _mm512_set1_ph(1.0);
        let e = _mm512_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
16934
16935    #[simd_test(enable = "avx512fp16,avx512vl")]
16936    const fn test_mm_setr_ph() {
16937        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16938        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16939        assert_eq_m128h(r, e);
16940    }
16941
    // `setr_ph` takes arguments lowest lane first; compare with reversed `set_ph`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_setr_ph() {
        let r = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let e = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit variant of the same lowest-lane-first check.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_setr_ph() {
        let r = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let e = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
16967
16968    #[simd_test(enable = "avx512fp16,avx512vl")]
16969    const fn test_mm_setzero_ph() {
16970        let r = _mm_setzero_ph();
16971        let e = _mm_set1_ph(0.0);
16972        assert_eq_m128h(r, e);
16973    }
16974
    // All-zero vector equals a broadcast of 0.0 (256-bit).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_setzero_ph() {
        let r = _mm256_setzero_ph();
        let e = _mm256_set1_ph(0.0);
        assert_eq_m256h(r, e);
    }

    // All-zero vector equals a broadcast of 0.0 (512-bit).
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_setzero_ph() {
        let r = _mm512_setzero_ph();
        let e = _mm512_set1_ph(0.0);
        assert_eq_m512h(r, e);
    }
16988
16989    #[simd_test(enable = "avx512fp16,avx512vl")]
16990    const fn test_mm_castsi128_ph() {
16991        let a = _mm_set1_epi16(0x3c00);
16992        let r = _mm_castsi128_ph(a);
16993        let e = _mm_set1_ph(1.0);
16994        assert_eq_m128h(r, e);
16995    }
16996
    // 0x3c00 is the binary16 bit pattern of 1.0; the cast only reinterprets bits.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_castsi256_ph() {
        let a = _mm256_set1_epi16(0x3c00);
        let r = _mm256_castsi256_ph(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    // Same bit-reinterpretation check at 512 bits.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_castsi512_ph() {
        let a = _mm512_set1_epi16(0x3c00);
        let r = _mm512_castsi512_ph(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
17012
    // Inverse direction: casting f16 1.0 lanes to integers yields 0x3c00 per lane.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm_castph_si128() {
        let a = _mm_set1_ph(1.0);
        let r = _mm_castph_si128(a);
        let e = _mm_set1_epi16(0x3c00);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm256_castph_si256() {
        let a = _mm256_set1_ph(1.0);
        let r = _mm256_castph_si256(a);
        let e = _mm256_set1_epi16(0x3c00);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_castph_si512() {
        let a = _mm512_set1_ph(1.0);
        let r = _mm512_castph_si512(a);
        let e = _mm512_set1_epi16(0x3c00);
        assert_eq_m512i(r, e);
    }
17036
    // f32 vector whose 16-bit halves are 0x3c00 reinterprets as all f16 ones.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_castps_ph() {
        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
        let r = _mm_castps_ph(a);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_castps_ph() {
        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
        let r = _mm256_castps_ph(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_castps_ph() {
        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
        let r = _mm512_castps_ph(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
17060
    // 0x3f800000 is the binary32 bit pattern of 1.0f32; the f16 vector is
    // built from those bits and then reinterpreted back.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm_castph_ps() {
        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
        let r = _mm_castph_ps(a);
        let e = _mm_set1_ps(1.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm256_castph_ps() {
        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
        let r = _mm256_castph_ps(a);
        let e = _mm256_set1_ps(1.0);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_castph_ps() {
        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
        let r = _mm512_castph_ps(a);
        let e = _mm512_set1_ps(1.0);
        assert_eq_m512(r, e);
    }
17084
    // f64 vector whose 16-bit slices are 0x3c00 reinterprets as all f16 ones.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_castpd_ph() {
        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
        let r = _mm_castpd_ph(a);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_castpd_ph() {
        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
        let r = _mm256_castpd_ph(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_castpd_ph() {
        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
        let r = _mm512_castpd_ph(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
17108
    // 0x3ff0000000000000 is the binary64 bit pattern of 1.0f64.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm_castph_pd() {
        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
        let r = _mm_castph_pd(a);
        let e = _mm_set1_pd(1.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm256_castph_pd() {
        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
        let r = _mm256_castph_pd(a);
        let e = _mm256_set1_pd(1.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_castph_pd() {
        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
        let r = _mm512_castph_pd(a);
        let e = _mm512_set1_pd(1.0);
        assert_eq_m512d(r, e);
    }
17132
    // Narrowing casts keep the low lanes and drop the rest.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_castph256_ph128() {
        let a = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm256_castph256_ph128(a);
        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm512_castph512_ph128() {
        let a = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );
        let r = _mm512_castph512_ph128(a);
        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm512_castph512_ph256() {
        let a = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );
        let r = _mm512_castph512_ph256(a);
        let e = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        assert_eq_m256h(r, e);
    }
17166
    // Widening casts leave the upper lanes undefined, so only the low lanes
    // are checked — via a round-trip through the matching narrowing cast.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_castph128_ph256() {
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castph128_ph256(a);
        assert_eq_m128h(_mm256_castph256_ph128(r), a);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm512_castph128_ph512() {
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_castph128_ph512(a);
        assert_eq_m128h(_mm512_castph512_ph128(r), a);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm512_castph256_ph512() {
        let a = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm512_castph256_ph512(a);
        assert_eq_m256h(_mm512_castph512_ph256(r), a);
    }
17189
    // Zero-extending casts: unlike the plain casts above, the upper lanes are
    // guaranteed to be zeroed, so the full result can be compared.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_zextph128_ph256() {
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_zextph128_ph256(a);
        let e = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_zextph128_ph512() {
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_zextph128_ph512(a);
        let e = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_zextph256_ph512() {
        let a = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm512_zextph256_ph512(a);
        let e = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512h(r, e);
    }
17223
17224    #[simd_test(enable = "avx512fp16,avx512vl")]
17225    fn test_mm_cmp_ph_mask() {
17226        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17227        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17228        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17229        assert_eq!(r, 0b11110000);
17230    }
17231
17232    #[simd_test(enable = "avx512fp16,avx512vl")]
17233    fn test_mm_mask_cmp_ph_mask() {
17234        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17235        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17236        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
17237        assert_eq!(r, 0b01010000);
17238    }
17239
17240    #[simd_test(enable = "avx512fp16,avx512vl")]
17241    fn test_mm256_cmp_ph_mask() {
17242        let a = _mm256_set_ph(
17243            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17244        );
17245        let b = _mm256_set_ph(
17246            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17247            -16.0,
17248        );
17249        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17250        assert_eq!(r, 0b1111000011110000);
17251    }
17252
17253    #[simd_test(enable = "avx512fp16,avx512vl")]
17254    fn test_mm256_mask_cmp_ph_mask() {
17255        let a = _mm256_set_ph(
17256            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17257        );
17258        let b = _mm256_set_ph(
17259            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17260            -16.0,
17261        );
17262        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
17263        assert_eq!(r, 0b0101000001010000);
17264    }
17265
    #[simd_test(enable = "avx512fp16")]
    // `b` negates every second group of four values, so the ordered-equal
    // compare sets the 32-bit mask in a repeating 0b11110000 nibble pattern.
    fn test_mm512_cmp_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b11110000111100001111000011110000);
    }
17281
    #[simd_test(enable = "avx512fp16")]
    // Same data as test_mm512_cmp_ph_mask; the alternating-bit write-mask is
    // ANDed with the comparison result, keeping only every other set bit.
    fn test_mm512_mask_cmp_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
        assert_eq!(r, 0b01010000010100000101000001010000);
    }
17297
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of test_mm512_cmp_ph_mask: _MM_FROUND_NO_EXC suppresses
    // floating-point exceptions; the expected mask pattern is unchanged.
    fn test_mm512_cmp_round_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
        assert_eq!(r, 0b11110000111100001111000011110000);
    }
17313
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of test_mm512_mask_cmp_ph_mask: the alternating write-mask
    // is ANDed with the exception-suppressed comparison result.
    fn test_mm512_mask_cmp_round_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        assert_eq!(r, 0b01010000010100000101000001010000);
    }
17333
17334    #[simd_test(enable = "avx512fp16")]
17335    fn test_mm_cmp_round_sh_mask() {
17336        let a = _mm_set_sh(1.0);
17337        let b = _mm_set_sh(1.0);
17338        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17339        assert_eq!(r, 1);
17340    }
17341
17342    #[simd_test(enable = "avx512fp16")]
17343    fn test_mm_mask_cmp_round_sh_mask() {
17344        let a = _mm_set_sh(1.0);
17345        let b = _mm_set_sh(1.0);
17346        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
17347        assert_eq!(r, 0);
17348    }
17349
17350    #[simd_test(enable = "avx512fp16")]
17351    fn test_mm_cmp_sh_mask() {
17352        let a = _mm_set_sh(1.0);
17353        let b = _mm_set_sh(1.0);
17354        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
17355        assert_eq!(r, 1);
17356    }
17357
17358    #[simd_test(enable = "avx512fp16")]
17359    fn test_mm_mask_cmp_sh_mask() {
17360        let a = _mm_set_sh(1.0);
17361        let b = _mm_set_sh(1.0);
17362        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
17363        assert_eq!(r, 0);
17364    }
17365
17366    #[simd_test(enable = "avx512fp16")]
17367    fn test_mm_comi_round_sh() {
17368        let a = _mm_set_sh(1.0);
17369        let b = _mm_set_sh(1.0);
17370        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17371        assert_eq!(r, 1);
17372    }
17373
17374    #[simd_test(enable = "avx512fp16")]
17375    fn test_mm_comi_sh() {
17376        let a = _mm_set_sh(1.0);
17377        let b = _mm_set_sh(1.0);
17378        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
17379        assert_eq!(r, 1);
17380    }
17381
17382    #[simd_test(enable = "avx512fp16")]
17383    fn test_mm_comieq_sh() {
17384        let a = _mm_set_sh(1.0);
17385        let b = _mm_set_sh(1.0);
17386        let r = _mm_comieq_sh(a, b);
17387        assert_eq!(r, 1);
17388    }
17389
17390    #[simd_test(enable = "avx512fp16")]
17391    fn test_mm_comige_sh() {
17392        let a = _mm_set_sh(2.0);
17393        let b = _mm_set_sh(1.0);
17394        let r = _mm_comige_sh(a, b);
17395        assert_eq!(r, 1);
17396    }
17397
17398    #[simd_test(enable = "avx512fp16")]
17399    fn test_mm_comigt_sh() {
17400        let a = _mm_set_sh(2.0);
17401        let b = _mm_set_sh(1.0);
17402        let r = _mm_comigt_sh(a, b);
17403        assert_eq!(r, 1);
17404    }
17405
17406    #[simd_test(enable = "avx512fp16")]
17407    fn test_mm_comile_sh() {
17408        let a = _mm_set_sh(1.0);
17409        let b = _mm_set_sh(2.0);
17410        let r = _mm_comile_sh(a, b);
17411        assert_eq!(r, 1);
17412    }
17413
17414    #[simd_test(enable = "avx512fp16")]
17415    fn test_mm_comilt_sh() {
17416        let a = _mm_set_sh(1.0);
17417        let b = _mm_set_sh(2.0);
17418        let r = _mm_comilt_sh(a, b);
17419        assert_eq!(r, 1);
17420    }
17421
17422    #[simd_test(enable = "avx512fp16")]
17423    fn test_mm_comineq_sh() {
17424        let a = _mm_set_sh(1.0);
17425        let b = _mm_set_sh(2.0);
17426        let r = _mm_comineq_sh(a, b);
17427        assert_eq!(r, 1);
17428    }
17429
17430    #[simd_test(enable = "avx512fp16")]
17431    fn test_mm_ucomieq_sh() {
17432        let a = _mm_set_sh(1.0);
17433        let b = _mm_set_sh(1.0);
17434        let r = _mm_ucomieq_sh(a, b);
17435        assert_eq!(r, 1);
17436    }
17437
17438    #[simd_test(enable = "avx512fp16")]
17439    fn test_mm_ucomige_sh() {
17440        let a = _mm_set_sh(2.0);
17441        let b = _mm_set_sh(1.0);
17442        let r = _mm_ucomige_sh(a, b);
17443        assert_eq!(r, 1);
17444    }
17445
17446    #[simd_test(enable = "avx512fp16")]
17447    fn test_mm_ucomigt_sh() {
17448        let a = _mm_set_sh(2.0);
17449        let b = _mm_set_sh(1.0);
17450        let r = _mm_ucomigt_sh(a, b);
17451        assert_eq!(r, 1);
17452    }
17453
17454    #[simd_test(enable = "avx512fp16")]
17455    fn test_mm_ucomile_sh() {
17456        let a = _mm_set_sh(1.0);
17457        let b = _mm_set_sh(2.0);
17458        let r = _mm_ucomile_sh(a, b);
17459        assert_eq!(r, 1);
17460    }
17461
17462    #[simd_test(enable = "avx512fp16")]
17463    fn test_mm_ucomilt_sh() {
17464        let a = _mm_set_sh(1.0);
17465        let b = _mm_set_sh(2.0);
17466        let r = _mm_ucomilt_sh(a, b);
17467        assert_eq!(r, 1);
17468    }
17469
17470    #[simd_test(enable = "avx512fp16")]
17471    fn test_mm_ucomineq_sh() {
17472        let a = _mm_set_sh(1.0);
17473        let b = _mm_set_sh(2.0);
17474        let r = _mm_ucomineq_sh(a, b);
17475        assert_eq!(r, 1);
17476    }
17477
17478    #[simd_test(enable = "avx512fp16,avx512vl")]
17479    const fn test_mm_load_ph() {
17480        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17481        let b = unsafe { _mm_load_ph(addr_of!(a).cast()) };
17482        assert_eq_m128h(a, b);
17483    }
17484
17485    #[simd_test(enable = "avx512fp16,avx512vl")]
17486    const fn test_mm256_load_ph() {
17487        let a = _mm256_set_ph(
17488            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17489        );
17490        let b = unsafe { _mm256_load_ph(addr_of!(a).cast()) };
17491        assert_eq_m256h(a, b);
17492    }
17493
17494    #[simd_test(enable = "avx512fp16")]
17495    const fn test_mm512_load_ph() {
17496        let a = _mm512_set_ph(
17497            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17498            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17499            31.0, 32.0,
17500        );
17501        let b = unsafe { _mm512_load_ph(addr_of!(a).cast()) };
17502        assert_eq_m512h(a, b);
17503    }
17504
17505    #[simd_test(enable = "avx512fp16,avx512vl")]
17506    const fn test_mm_load_sh() {
17507        let a = _mm_set_sh(1.0);
17508        let b = unsafe { _mm_load_sh(addr_of!(a).cast()) };
17509        assert_eq_m128h(a, b);
17510    }
17511
17512    #[simd_test(enable = "avx512fp16,avx512vl")]
17513    fn test_mm_mask_load_sh() {
17514        let a = _mm_set_sh(1.0);
17515        let src = _mm_set_sh(2.);
17516        let b = unsafe { _mm_mask_load_sh(src, 1, addr_of!(a).cast()) };
17517        assert_eq_m128h(a, b);
17518        let b = unsafe { _mm_mask_load_sh(src, 0, addr_of!(a).cast()) };
17519        assert_eq_m128h(src, b);
17520    }
17521
17522    #[simd_test(enable = "avx512fp16,avx512vl")]
17523    fn test_mm_maskz_load_sh() {
17524        let a = _mm_set_sh(1.0);
17525        let b = unsafe { _mm_maskz_load_sh(1, addr_of!(a).cast()) };
17526        assert_eq_m128h(a, b);
17527        let b = unsafe { _mm_maskz_load_sh(0, addr_of!(a).cast()) };
17528        assert_eq_m128h(_mm_setzero_ph(), b);
17529    }
17530
17531    #[simd_test(enable = "avx512fp16,avx512vl")]
17532    const fn test_mm_loadu_ph() {
17533        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
17534        let r = unsafe { _mm_loadu_ph(array.as_ptr()) };
17535        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17536        assert_eq_m128h(r, e);
17537    }
17538
17539    #[simd_test(enable = "avx512fp16,avx512vl")]
17540    const fn test_mm256_loadu_ph() {
17541        let array = [
17542            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17543        ];
17544        let r = unsafe { _mm256_loadu_ph(array.as_ptr()) };
17545        let e = _mm256_setr_ph(
17546            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17547        );
17548        assert_eq_m256h(r, e);
17549    }
17550
17551    #[simd_test(enable = "avx512fp16")]
17552    const fn test_mm512_loadu_ph() {
17553        let array = [
17554            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17555            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17556            31.0, 32.0,
17557        ];
17558        let r = unsafe { _mm512_loadu_ph(array.as_ptr()) };
17559        let e = _mm512_setr_ph(
17560            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17561            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17562            31.0, 32.0,
17563        );
17564        assert_eq_m512h(r, e);
17565    }
17566
17567    #[simd_test(enable = "avx512fp16,avx512vl")]
17568    const fn test_mm_move_sh() {
17569        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17570        let b = _mm_set_sh(9.0);
17571        let r = _mm_move_sh(a, b);
17572        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
17573        assert_eq_m128h(r, e);
17574    }
17575
17576    #[simd_test(enable = "avx512fp16,avx512vl")]
17577    const fn test_mm_mask_move_sh() {
17578        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17579        let b = _mm_set_sh(9.0);
17580        let src = _mm_set_sh(10.0);
17581        let r = _mm_mask_move_sh(src, 0, a, b);
17582        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
17583        assert_eq_m128h(r, e);
17584    }
17585
17586    #[simd_test(enable = "avx512fp16,avx512vl")]
17587    const fn test_mm_maskz_move_sh() {
17588        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17589        let b = _mm_set_sh(9.0);
17590        let r = _mm_maskz_move_sh(0, a, b);
17591        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
17592        assert_eq_m128h(r, e);
17593    }
17594
17595    #[simd_test(enable = "avx512fp16,avx512vl")]
17596    const fn test_mm_store_ph() {
17597        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17598        let mut b = _mm_setzero_ph();
17599        unsafe {
17600            _mm_store_ph(addr_of_mut!(b).cast(), a);
17601        }
17602        assert_eq_m128h(a, b);
17603    }
17604
17605    #[simd_test(enable = "avx512fp16,avx512vl")]
17606    const fn test_mm256_store_ph() {
17607        let a = _mm256_set_ph(
17608            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17609        );
17610        let mut b = _mm256_setzero_ph();
17611        unsafe {
17612            _mm256_store_ph(addr_of_mut!(b).cast(), a);
17613        }
17614        assert_eq_m256h(a, b);
17615    }
17616
17617    #[simd_test(enable = "avx512fp16")]
17618    const fn test_mm512_store_ph() {
17619        let a = _mm512_set_ph(
17620            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17621            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17622            31.0, 32.0,
17623        );
17624        let mut b = _mm512_setzero_ph();
17625        unsafe {
17626            _mm512_store_ph(addr_of_mut!(b).cast(), a);
17627        }
17628        assert_eq_m512h(a, b);
17629    }
17630
17631    #[simd_test(enable = "avx512fp16,avx512vl")]
17632    const fn test_mm_store_sh() {
17633        let a = _mm_set_sh(1.0);
17634        let mut b = _mm_setzero_ph();
17635        unsafe {
17636            _mm_store_sh(addr_of_mut!(b).cast(), a);
17637        }
17638        assert_eq_m128h(a, b);
17639    }
17640
17641    #[simd_test(enable = "avx512fp16,avx512vl")]
17642    fn test_mm_mask_store_sh() {
17643        let a = _mm_set_sh(1.0);
17644        let mut b = _mm_setzero_ph();
17645        unsafe {
17646            _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
17647        }
17648        assert_eq_m128h(_mm_setzero_ph(), b);
17649        unsafe {
17650            _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
17651        }
17652        assert_eq_m128h(a, b);
17653    }
17654
17655    #[simd_test(enable = "avx512fp16,avx512vl")]
17656    const fn test_mm_storeu_ph() {
17657        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17658        let mut array = [0.0; 8];
17659        unsafe {
17660            _mm_storeu_ph(array.as_mut_ptr(), a);
17661        }
17662        assert_eq_m128h(a, unsafe { _mm_loadu_ph(array.as_ptr()) });
17663    }
17664
17665    #[simd_test(enable = "avx512fp16,avx512vl")]
17666    const fn test_mm256_storeu_ph() {
17667        let a = _mm256_set_ph(
17668            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17669        );
17670        let mut array = [0.0; 16];
17671        unsafe {
17672            _mm256_storeu_ph(array.as_mut_ptr(), a);
17673        }
17674        assert_eq_m256h(a, unsafe { _mm256_loadu_ph(array.as_ptr()) });
17675    }
17676
17677    #[simd_test(enable = "avx512fp16")]
17678    const fn test_mm512_storeu_ph() {
17679        let a = _mm512_set_ph(
17680            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17681            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17682            31.0, 32.0,
17683        );
17684        let mut array = [0.0; 32];
17685        unsafe {
17686            _mm512_storeu_ph(array.as_mut_ptr(), a);
17687        }
17688        assert_eq_m512h(a, unsafe { _mm512_loadu_ph(array.as_ptr()) });
17689    }
17690
17691    #[simd_test(enable = "avx512fp16,avx512vl")]
17692    const fn test_mm_add_ph() {
17693        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17694        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17695        let r = _mm_add_ph(a, b);
17696        let e = _mm_set1_ph(9.0);
17697        assert_eq_m128h(r, e);
17698    }
17699
17700    #[simd_test(enable = "avx512fp16,avx512vl")]
17701    const fn test_mm_mask_add_ph() {
17702        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17703        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17704        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17705        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
17706        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
17707        assert_eq_m128h(r, e);
17708    }
17709
17710    #[simd_test(enable = "avx512fp16,avx512vl")]
17711    const fn test_mm_maskz_add_ph() {
17712        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17713        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17714        let r = _mm_maskz_add_ph(0b01010101, a, b);
17715        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
17716        assert_eq_m128h(r, e);
17717    }
17718
17719    #[simd_test(enable = "avx512fp16,avx512vl")]
17720    const fn test_mm256_add_ph() {
17721        let a = _mm256_set_ph(
17722            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17723        );
17724        let b = _mm256_set_ph(
17725            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17726        );
17727        let r = _mm256_add_ph(a, b);
17728        let e = _mm256_set1_ph(17.0);
17729        assert_eq_m256h(r, e);
17730    }
17731
17732    #[simd_test(enable = "avx512fp16,avx512vl")]
17733    const fn test_mm256_mask_add_ph() {
17734        let a = _mm256_set_ph(
17735            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17736        );
17737        let b = _mm256_set_ph(
17738            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17739        );
17740        let src = _mm256_set_ph(
17741            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17742        );
17743        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
17744        let e = _mm256_set_ph(
17745            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
17746        );
17747        assert_eq_m256h(r, e);
17748    }
17749
17750    #[simd_test(enable = "avx512fp16,avx512vl")]
17751    const fn test_mm256_maskz_add_ph() {
17752        let a = _mm256_set_ph(
17753            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17754        );
17755        let b = _mm256_set_ph(
17756            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17757        );
17758        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
17759        let e = _mm256_set_ph(
17760            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
17761        );
17762        assert_eq_m256h(r, e);
17763    }
17764
17765    #[simd_test(enable = "avx512fp16")]
17766    const fn test_mm512_add_ph() {
17767        let a = _mm512_set_ph(
17768            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17769            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17770            31.0, 32.0,
17771        );
17772        let b = _mm512_set_ph(
17773            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17774            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17775            3.0, 2.0, 1.0,
17776        );
17777        let r = _mm512_add_ph(a, b);
17778        let e = _mm512_set1_ph(33.0);
17779        assert_eq_m512h(r, e);
17780    }
17781
    #[simd_test(enable = "avx512fp16")]
    // Lanes with a set mask bit hold the sum (always 33, since `a` and `b`
    // are mirror images); the other lanes keep the corresponding `src` value.
    const fn test_mm512_mask_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
        );
        assert_eq_m512h(r, e);
    }
17805
    #[simd_test(enable = "avx512fp16")]
    // Zeroing form: lanes with a set mask bit hold the sum (33); the other
    // lanes are forced to zero.
    const fn test_mm512_maskz_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
        );
        assert_eq_m512h(r, e);
    }
17825
17826    #[simd_test(enable = "avx512fp16")]
17827    fn test_mm512_add_round_ph() {
17828        let a = _mm512_set_ph(
17829            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17830            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17831            31.0, 32.0,
17832        );
17833        let b = _mm512_set_ph(
17834            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17835            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17836            3.0, 2.0, 1.0,
17837        );
17838        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17839        let e = _mm512_set1_ph(33.0);
17840        assert_eq_m512h(r, e);
17841    }
17842
    #[simd_test(enable = "avx512fp16")]
    // Rounding-controlled, merge-masked add: lanes with a set mask bit hold
    // the sum (33); the other lanes keep the corresponding `src` value.
    fn test_mm512_mask_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
        );
        assert_eq_m512h(r, e);
    }
17871
    #[simd_test(enable = "avx512fp16")]
    // Rounding-controlled, zero-masked add: lanes with a set mask bit hold
    // the sum (33); the other lanes are forced to zero.
    fn test_mm512_maskz_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
        );
        assert_eq_m512h(r, e);
    }
17895
17896    #[simd_test(enable = "avx512fp16,avx512vl")]
17897    fn test_mm_add_round_sh() {
17898        let a = _mm_set_sh(1.0);
17899        let b = _mm_set_sh(2.0);
17900        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17901        let e = _mm_set_sh(3.0);
17902        assert_eq_m128h(r, e);
17903    }
17904
17905    #[simd_test(enable = "avx512fp16,avx512vl")]
17906    fn test_mm_mask_add_round_sh() {
17907        let a = _mm_set_sh(1.0);
17908        let b = _mm_set_sh(2.0);
17909        let src = _mm_set_sh(4.0);
17910        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17911            src, 0, a, b,
17912        );
17913        let e = _mm_set_sh(4.0);
17914        assert_eq_m128h(r, e);
17915        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17916            src, 1, a, b,
17917        );
17918        let e = _mm_set_sh(3.0);
17919        assert_eq_m128h(r, e);
17920    }
17921
17922    #[simd_test(enable = "avx512fp16,avx512vl")]
17923    fn test_mm_maskz_add_round_sh() {
17924        let a = _mm_set_sh(1.0);
17925        let b = _mm_set_sh(2.0);
17926        let r =
17927            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17928        let e = _mm_set_sh(0.0);
17929        assert_eq_m128h(r, e);
17930        let r =
17931            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
17932        let e = _mm_set_sh(3.0);
17933        assert_eq_m128h(r, e);
17934    }
17935
17936    #[simd_test(enable = "avx512fp16,avx512vl")]
17937    const fn test_mm_add_sh() {
17938        let a = _mm_set_sh(1.0);
17939        let b = _mm_set_sh(2.0);
17940        let r = _mm_add_sh(a, b);
17941        let e = _mm_set_sh(3.0);
17942        assert_eq_m128h(r, e);
17943    }
17944
17945    #[simd_test(enable = "avx512fp16,avx512vl")]
17946    const fn test_mm_mask_add_sh() {
17947        let a = _mm_set_sh(1.0);
17948        let b = _mm_set_sh(2.0);
17949        let src = _mm_set_sh(4.0);
17950        let r = _mm_mask_add_sh(src, 0, a, b);
17951        let e = _mm_set_sh(4.0);
17952        assert_eq_m128h(r, e);
17953        let r = _mm_mask_add_sh(src, 1, a, b);
17954        let e = _mm_set_sh(3.0);
17955        assert_eq_m128h(r, e);
17956    }
17957
17958    #[simd_test(enable = "avx512fp16,avx512vl")]
17959    const fn test_mm_maskz_add_sh() {
17960        let a = _mm_set_sh(1.0);
17961        let b = _mm_set_sh(2.0);
17962        let r = _mm_maskz_add_sh(0, a, b);
17963        let e = _mm_set_sh(0.0);
17964        assert_eq_m128h(r, e);
17965        let r = _mm_maskz_add_sh(1, a, b);
17966        let e = _mm_set_sh(3.0);
17967        assert_eq_m128h(r, e);
17968    }
17969
17970    #[simd_test(enable = "avx512fp16,avx512vl")]
17971    const fn test_mm_sub_ph() {
17972        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17973        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17974        let r = _mm_sub_ph(a, b);
17975        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
17976        assert_eq_m128h(r, e);
17977    }
17978
    // Masked packed f16 subtract: mask bit i set -> lane i = a[i] - b[i];
    // clear -> lane i copied from src (matches the expected table below).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_sub_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
        assert_eq_m128h(r, e);
    }
17988
    // Zero-masked packed f16 subtract: mask bit i set -> lane i = a[i] - b[i];
    // clear -> lane i zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_sub_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_maskz_sub_ph(0b01010101, a, b);
        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
        assert_eq_m128h(r, e);
    }
17997
    // Packed f16 subtract (256-bit, 16 lanes): lane i = a[i] - b[i].
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_sub_ph(a, b);
        let e = _mm256_set_ph(
            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
            15.0,
        );
        assert_eq_m256h(r, e);
    }
18013
    // Masked 256-bit f16 subtract: set mask bits take a[i] - b[i], clear bits
    // keep the corresponding lane of src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
        );
        assert_eq_m256h(r, e);
    }
18031
    // Zero-masked 256-bit f16 subtract: clear mask bits zero the lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
        );
        assert_eq_m256h(r, e);
    }
18046
    // Packed f16 subtract (512-bit, 32 lanes): lane i = a[i] - b[i].
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_sub_ph(a, b);
        let e = _mm512_set_ph(
            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
            23.0, 25.0, 27.0, 29.0, 31.0,
        );
        assert_eq_m512h(r, e);
    }
18067
    // Masked 512-bit f16 subtract: set mask bits take a[i] - b[i], clear bits
    // keep the corresponding lane of src.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
        );
        assert_eq_m512h(r, e);
    }
18091
    // Zero-masked 512-bit f16 subtract: clear mask bits zero the lane.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
        );
        assert_eq_m512h(r, e);
    }
18111
    // 512-bit f16 subtract with explicit rounding control:
    // round-to-nearest-even with floating-point exceptions suppressed.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set_ph(
            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
            23.0, 25.0, 27.0, 29.0, 31.0,
        );
        assert_eq_m512h(r, e);
    }
18132
    // Masked 512-bit f16 subtract with rounding control: set mask bits take
    // a[i] - b[i], clear bits keep the lane from src.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
        );
        assert_eq_m512h(r, e);
    }
18161
    // Zero-masked 512-bit f16 subtract with rounding control: clear mask bits
    // zero the lane.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
        );
        assert_eq_m512h(r, e);
    }
18185
18186    #[simd_test(enable = "avx512fp16,avx512vl")]
18187    fn test_mm_sub_round_sh() {
18188        let a = _mm_set_sh(1.0);
18189        let b = _mm_set_sh(2.0);
18190        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18191        let e = _mm_set_sh(-1.0);
18192        assert_eq_m128h(r, e);
18193    }
18194
18195    #[simd_test(enable = "avx512fp16,avx512vl")]
18196    fn test_mm_mask_sub_round_sh() {
18197        let a = _mm_set_sh(1.0);
18198        let b = _mm_set_sh(2.0);
18199        let src = _mm_set_sh(4.0);
18200        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18201            src, 0, a, b,
18202        );
18203        let e = _mm_set_sh(4.0);
18204        assert_eq_m128h(r, e);
18205        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18206            src, 1, a, b,
18207        );
18208        let e = _mm_set_sh(-1.0);
18209        assert_eq_m128h(r, e);
18210    }
18211
18212    #[simd_test(enable = "avx512fp16,avx512vl")]
18213    fn test_mm_maskz_sub_round_sh() {
18214        let a = _mm_set_sh(1.0);
18215        let b = _mm_set_sh(2.0);
18216        let r =
18217            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18218        let e = _mm_set_sh(0.0);
18219        assert_eq_m128h(r, e);
18220        let r =
18221            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18222        let e = _mm_set_sh(-1.0);
18223        assert_eq_m128h(r, e);
18224    }
18225
18226    #[simd_test(enable = "avx512fp16,avx512vl")]
18227    const fn test_mm_sub_sh() {
18228        let a = _mm_set_sh(1.0);
18229        let b = _mm_set_sh(2.0);
18230        let r = _mm_sub_sh(a, b);
18231        let e = _mm_set_sh(-1.0);
18232        assert_eq_m128h(r, e);
18233    }
18234
18235    #[simd_test(enable = "avx512fp16,avx512vl")]
18236    const fn test_mm_mask_sub_sh() {
18237        let a = _mm_set_sh(1.0);
18238        let b = _mm_set_sh(2.0);
18239        let src = _mm_set_sh(4.0);
18240        let r = _mm_mask_sub_sh(src, 0, a, b);
18241        let e = _mm_set_sh(4.0);
18242        assert_eq_m128h(r, e);
18243        let r = _mm_mask_sub_sh(src, 1, a, b);
18244        let e = _mm_set_sh(-1.0);
18245        assert_eq_m128h(r, e);
18246    }
18247
18248    #[simd_test(enable = "avx512fp16,avx512vl")]
18249    const fn test_mm_maskz_sub_sh() {
18250        let a = _mm_set_sh(1.0);
18251        let b = _mm_set_sh(2.0);
18252        let r = _mm_maskz_sub_sh(0, a, b);
18253        let e = _mm_set_sh(0.0);
18254        assert_eq_m128h(r, e);
18255        let r = _mm_maskz_sub_sh(1, a, b);
18256        let e = _mm_set_sh(-1.0);
18257        assert_eq_m128h(r, e);
18258    }
18259
    // Packed f16 multiply (128-bit): lane i of the result is a[i] * b[i].
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mul_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_mul_ph(a, b);
        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
        assert_eq_m128h(r, e);
    }
18268
    // Masked packed f16 multiply: set mask bits take a[i] * b[i], clear bits
    // keep the corresponding lane of src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_mul_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
        assert_eq_m128h(r, e);
    }
18278
    // Zero-masked packed f16 multiply: clear mask bits zero the lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_mul_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_maskz_mul_ph(0b01010101, a, b);
        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
        assert_eq_m128h(r, e);
    }
18287
    // Packed f16 multiply (256-bit, 16 lanes): lane i = a[i] * b[i].
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_mul_ph(a, b);
        let e = _mm256_set_ph(
            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
            30.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
18303
    // Masked 256-bit f16 multiply: set mask bits take a[i] * b[i], clear bits
    // keep the corresponding lane of src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
        );
        assert_eq_m256h(r, e);
    }
18321
    // Zero-masked 256-bit f16 multiply: clear mask bits zero the lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }
18336
    // Packed f16 multiply (512-bit, 32 lanes): lane i = a[i] * b[i].
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_mul_ph(a, b);
        let e = _mm512_set_ph(
            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
18357
    // Masked 512-bit f16 multiply: set mask bits take a[i] * b[i], clear bits
    // keep the corresponding lane of src.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
        );
        assert_eq_m512h(r, e);
    }
18381
    // Zero-masked 512-bit f16 multiply: clear mask bits zero the lane.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
18401
    // 512-bit f16 multiply with explicit rounding control:
    // round-to-nearest-even with floating-point exceptions suppressed.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set_ph(
            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
18422
    // Masked 512-bit f16 multiply with rounding control: set mask bits take
    // a[i] * b[i], clear bits keep the lane from src.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
        );
        assert_eq_m512h(r, e);
    }
18451
    // Zero-masked 512-bit f16 multiply with rounding control: clear mask bits
    // zero the lane.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
18475
18476    #[simd_test(enable = "avx512fp16,avx512vl")]
18477    fn test_mm_mul_round_sh() {
18478        let a = _mm_set_sh(1.0);
18479        let b = _mm_set_sh(2.0);
18480        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18481        let e = _mm_set_sh(2.0);
18482        assert_eq_m128h(r, e);
18483    }
18484
18485    #[simd_test(enable = "avx512fp16,avx512vl")]
18486    fn test_mm_mask_mul_round_sh() {
18487        let a = _mm_set_sh(1.0);
18488        let b = _mm_set_sh(2.0);
18489        let src = _mm_set_sh(4.0);
18490        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18491            src, 0, a, b,
18492        );
18493        let e = _mm_set_sh(4.0);
18494        assert_eq_m128h(r, e);
18495        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18496            src, 1, a, b,
18497        );
18498        let e = _mm_set_sh(2.0);
18499        assert_eq_m128h(r, e);
18500    }
18501
18502    #[simd_test(enable = "avx512fp16,avx512vl")]
18503    fn test_mm_maskz_mul_round_sh() {
18504        let a = _mm_set_sh(1.0);
18505        let b = _mm_set_sh(2.0);
18506        let r =
18507            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18508        let e = _mm_set_sh(0.0);
18509        assert_eq_m128h(r, e);
18510        let r =
18511            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18512        let e = _mm_set_sh(2.0);
18513        assert_eq_m128h(r, e);
18514    }
18515
18516    #[simd_test(enable = "avx512fp16,avx512vl")]
18517    const fn test_mm_mul_sh() {
18518        let a = _mm_set_sh(1.0);
18519        let b = _mm_set_sh(2.0);
18520        let r = _mm_mul_sh(a, b);
18521        let e = _mm_set_sh(2.0);
18522        assert_eq_m128h(r, e);
18523    }
18524
18525    #[simd_test(enable = "avx512fp16,avx512vl")]
18526    const fn test_mm_mask_mul_sh() {
18527        let a = _mm_set_sh(1.0);
18528        let b = _mm_set_sh(2.0);
18529        let src = _mm_set_sh(4.0);
18530        let r = _mm_mask_mul_sh(src, 0, a, b);
18531        let e = _mm_set_sh(4.0);
18532        assert_eq_m128h(r, e);
18533        let r = _mm_mask_mul_sh(src, 1, a, b);
18534        let e = _mm_set_sh(2.0);
18535        assert_eq_m128h(r, e);
18536    }
18537
18538    #[simd_test(enable = "avx512fp16,avx512vl")]
18539    const fn test_mm_maskz_mul_sh() {
18540        let a = _mm_set_sh(1.0);
18541        let b = _mm_set_sh(2.0);
18542        let r = _mm_maskz_mul_sh(0, a, b);
18543        let e = _mm_set_sh(0.0);
18544        assert_eq_m128h(r, e);
18545        let r = _mm_maskz_mul_sh(1, a, b);
18546        let e = _mm_set_sh(2.0);
18547        assert_eq_m128h(r, e);
18548    }
18549
18550    #[simd_test(enable = "avx512fp16,avx512vl")]
18551    const fn test_mm_div_ph() {
18552        let a = _mm_set1_ph(1.0);
18553        let b = _mm_set1_ph(2.0);
18554        let r = _mm_div_ph(a, b);
18555        let e = _mm_set1_ph(0.5);
18556        assert_eq_m128h(r, e);
18557    }
18558
18559    #[simd_test(enable = "avx512fp16,avx512vl")]
18560    const fn test_mm_mask_div_ph() {
18561        let a = _mm_set1_ph(1.0);
18562        let b = _mm_set1_ph(2.0);
18563        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
18564        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
18565        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
18566        assert_eq_m128h(r, e);
18567    }
18568
18569    #[simd_test(enable = "avx512fp16,avx512vl")]
18570    const fn test_mm_maskz_div_ph() {
18571        let a = _mm_set1_ph(1.0);
18572        let b = _mm_set1_ph(2.0);
18573        let r = _mm_maskz_div_ph(0b01010101, a, b);
18574        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
18575        assert_eq_m128h(r, e);
18576    }
18577
18578    #[simd_test(enable = "avx512fp16,avx512vl")]
18579    const fn test_mm256_div_ph() {
18580        let a = _mm256_set1_ph(1.0);
18581        let b = _mm256_set1_ph(2.0);
18582        let r = _mm256_div_ph(a, b);
18583        let e = _mm256_set1_ph(0.5);
18584        assert_eq_m256h(r, e);
18585    }
18586
    // Masked 256-bit f16 division: set mask bits take a / b (= 0.5), clear
    // bits keep the corresponding lane of src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_div_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let src = _mm256_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0,
        );
        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
18601
    // Zero-masked 256-bit f16 division: clear mask bits zero the lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_div_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
18612
18613    #[simd_test(enable = "avx512fp16")]
18614    const fn test_mm512_div_ph() {
18615        let a = _mm512_set1_ph(1.0);
18616        let b = _mm512_set1_ph(2.0);
18617        let r = _mm512_div_ph(a, b);
18618        let e = _mm512_set1_ph(0.5);
18619        assert_eq_m512h(r, e);
18620    }
18621
    // Masked 512-bit f16 division: set mask bits take a / b (= 0.5), clear
    // bits keep the corresponding lane of src.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_div_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let src = _mm512_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
            33.0, 34.0, 35.0,
        );
        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18638
    // Zero-masked 512-bit f16 division: clear mask bits zero the lane.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_div_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18650
18651    #[simd_test(enable = "avx512fp16")]
18652    fn test_mm512_div_round_ph() {
18653        let a = _mm512_set1_ph(1.0);
18654        let b = _mm512_set1_ph(2.0);
18655        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18656        let e = _mm512_set1_ph(0.5);
18657        assert_eq_m512h(r, e);
18658    }
18659
    // Masked 512-bit f16 division with rounding control: set mask bits take
    // a / b (= 0.5), clear bits keep the lane from src.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_div_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let src = _mm512_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
            33.0, 34.0, 35.0,
        );
        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18681
    // Zero-masked 512-bit f16 division with rounding control: clear mask bits
    // zero the lane.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_div_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18697
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Scalar half-precision divide with rounding: lane 0 = 1.0 / 2.0 = 0.5.
    fn test_mm_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear keeps `src` in lane 0; bit 0 set computes the quotient.
    fn test_mm_mask_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear zeroes lane 0; bit 0 set computes the quotient.
    fn test_mm_maskz_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r =
            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }
18737
18738    #[simd_test(enable = "avx512fp16,avx512vl")]
18739    const fn test_mm_div_sh() {
18740        let a = _mm_set_sh(1.0);
18741        let b = _mm_set_sh(2.0);
18742        let r = _mm_div_sh(a, b);
18743        let e = _mm_set_sh(0.5);
18744        assert_eq_m128h(r, e);
18745    }
18746
18747    #[simd_test(enable = "avx512fp16,avx512vl")]
18748    const fn test_mm_mask_div_sh() {
18749        let a = _mm_set_sh(1.0);
18750        let b = _mm_set_sh(2.0);
18751        let src = _mm_set_sh(4.0);
18752        let r = _mm_mask_div_sh(src, 0, a, b);
18753        let e = _mm_set_sh(4.0);
18754        assert_eq_m128h(r, e);
18755        let r = _mm_mask_div_sh(src, 1, a, b);
18756        let e = _mm_set_sh(0.5);
18757        assert_eq_m128h(r, e);
18758    }
18759
18760    #[simd_test(enable = "avx512fp16,avx512vl")]
18761    const fn test_mm_maskz_div_sh() {
18762        let a = _mm_set_sh(1.0);
18763        let b = _mm_set_sh(2.0);
18764        let r = _mm_maskz_div_sh(0, a, b);
18765        let e = _mm_set_sh(0.0);
18766        assert_eq_m128h(r, e);
18767        let r = _mm_maskz_div_sh(1, a, b);
18768        let e = _mm_set_sh(0.5);
18769        assert_eq_m128h(r, e);
18770    }
18771
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Complex multiply: i * i = -1, so each (re, im) pair becomes (-1.0, 0.0).
    fn test_mm_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_mul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Masked complex multiply: each mask bit covers one (re, im) pair;
    // set bits take (-1, 0), clear bits keep the `src` pair.
    fn test_mm_mask_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masked complex multiply: clear mask bits zero the whole (re, im) pair.
    fn test_mm_maskz_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_mul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit complex multiply: i * i = -1 in every pair.
    fn test_mm256_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_mul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit masked complex multiply: set bits take (-1, 0), clear bits keep `src`.
    fn test_mm256_mask_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit zero-masked complex multiply: clear bits zero the pair.
    fn test_mm256_maskz_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
18833
    #[simd_test(enable = "avx512fp16")]
    // 512-bit complex multiply: i * i = -1 in every (re, im) pair.
    fn test_mm512_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_mul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // 512-bit masked complex multiply: set bits take (-1, 0), clear bits keep `src`.
    fn test_mm512_mask_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // 512-bit zero-masked complex multiply: clear bits zero the pair.
    fn test_mm512_maskz_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Same as test_mm512_mul_pch but with explicit rounding control.
    fn test_mm512_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Masked complex multiply with rounding: clear bits keep the `src` pair.
    fn test_mm512_mask_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Zero-masked complex multiply with rounding: clear bits zero the pair.
    fn test_mm512_maskz_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
18920
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Scalar complex multiply with rounding: pair 0 is i * i = (-1, 0);
    // the upper elements are passed through from `a`.
    fn test_mm_mul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear keeps the `src` pair in lanes 0..2; upper lanes come from `a`.
    fn test_mm_mask_mul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear zeroes the result pair; upper lanes come from `a`.
    fn test_mm_maskz_mul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r =
            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Scalar complex multiply: pair 0 is i * i = (-1, 0), upper lanes from `a`.
    fn test_mm_mul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_mul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear keeps the `src` pair; upper lanes come from `a`.
    fn test_mm_mask_mul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_mul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear zeroes the result pair; upper lanes come from `a`.
    fn test_mm_maskz_mul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_maskz_mul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
18979
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // fmul_pch is an alias family for mul_pch: i * i = (-1, 0) in each pair.
    fn test_mm_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_fmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Masked variant: set bits take (-1, 0), clear bits keep the `src` pair.
    fn test_mm_mask_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masked variant: clear bits zero the whole pair.
    fn test_mm_maskz_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_fmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit fmul_pch: i * i = (-1, 0) in every pair.
    fn test_mm256_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_fmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit masked fmul_pch: clear bits keep the `src` pair.
    fn test_mm256_mask_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit zero-masked fmul_pch: clear bits zero the pair.
    fn test_mm256_maskz_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
19041
    #[simd_test(enable = "avx512fp16")]
    // 512-bit fmul_pch: i * i = (-1, 0) in every pair.
    fn test_mm512_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_fmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // 512-bit masked fmul_pch: set bits take (-1, 0), clear bits keep `src`.
    fn test_mm512_mask_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // 512-bit zero-masked fmul_pch: clear bits zero the pair.
    fn test_mm512_maskz_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Same as test_mm512_fmul_pch but with explicit rounding control.
    fn test_mm512_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Masked fmul with rounding: clear bits keep the `src` pair.
    fn test_mm512_mask_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Zero-masked fmul with rounding: clear bits zero the pair.
    fn test_mm512_maskz_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19128
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Scalar fmul with rounding: pair 0 is i * i = (-1, 0), upper lanes from `a`.
    fn test_mm_fmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear keeps the `src` pair; upper lanes come from `a`.
    fn test_mm_mask_fmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear zeroes the result pair; upper lanes come from `a`.
    fn test_mm_maskz_fmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r =
            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Scalar fmul: pair 0 is i * i = (-1, 0), upper lanes from `a`.
    fn test_mm_fmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_fmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear keeps the `src` pair; upper lanes come from `a`.
    fn test_mm_mask_fmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fmul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear zeroes the result pair; upper lanes come from `a`.
    fn test_mm_maskz_fmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_maskz_fmul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19187
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Conjugate complex multiply: a * conj(b) = i * conj(-i) = i * i = (-1, 0).
    fn test_mm_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_cmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Masked conjugate multiply: set bits take (-1, 0), clear bits keep the `src` pair.
    fn test_mm_mask_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masked conjugate multiply: clear bits zero the whole pair.
    fn test_mm_maskz_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_maskz_cmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit conjugate multiply: a * conj(b) = (-1, 0) in every pair.
    fn test_mm256_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_cmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit masked conjugate multiply: clear bits keep the `src` pair.
    fn test_mm256_mask_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit zero-masked conjugate multiply: clear bits zero the pair.
    fn test_mm256_maskz_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
19249
    #[simd_test(enable = "avx512fp16")]
    // 512-bit conjugate multiply: a * conj(b) = i * i = (-1, 0) in every pair.
    fn test_mm512_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_cmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // 512-bit masked conjugate multiply: clear bits keep the `src` pair.
    fn test_mm512_mask_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // 512-bit zero-masked conjugate multiply: clear bits zero the pair.
    fn test_mm512_maskz_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Same as test_mm512_cmul_pch but with explicit rounding control.
    fn test_mm512_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Masked conjugate multiply with rounding: clear bits keep the `src` pair.
    fn test_mm512_mask_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    // Zero-masked conjugate multiply with rounding: clear bits zero the pair.
    fn test_mm512_maskz_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19336
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Scalar conjugate multiply: pair 0 is i * conj(-i) = (-1, 0), upper lanes from `a`.
    fn test_mm_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_cmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear keeps the `src` pair; upper lanes come from `a`.
    fn test_mm_mask_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_cmul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear zeroes the result pair; upper lanes come from `a`.
    fn test_mm_maskz_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_maskz_cmul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Scalar conjugate multiply with rounding: pair 0 is (-1, 0), upper lanes from `a`.
    fn test_mm_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear keeps the `src` pair; upper lanes come from `a`.
    fn test_mm_mask_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Mask bit 0 clear zeroes the result pair; upper lanes come from `a`.
    fn test_mm_maskz_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r =
            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19395
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // fcmul_pch is an alias family for cmul_pch: a * conj(b) = (-1, 0) per pair.
    fn test_mm_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_fcmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Masked variant: set bits take (-1, 0), clear bits keep the `src` pair.
    fn test_mm_mask_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masked variant: clear bits zero the whole pair.
    fn test_mm_maskz_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit fcmul_pch: a * conj(b) = (-1, 0) in every pair.
    fn test_mm256_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_fcmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit masked fcmul_pch: clear bits keep the `src` pair.
    fn test_mm256_mask_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 256-bit zero-masked fcmul_pch: clear bits zero the pair.
    fn test_mm256_maskz_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
19457
19458    #[simd_test(enable = "avx512fp16")]
19459    fn test_mm512_fcmul_pch() {
19460        let a = _mm512_set1_pch(0.0, 1.0);
19461        let b = _mm512_set1_pch(0.0, -1.0);
19462        let r = _mm512_fcmul_pch(a, b);
19463        let e = _mm512_set1_pch(-1.0, 0.0);
19464        assert_eq_m512h(r, e);
19465    }
19466
19467    #[simd_test(enable = "avx512fp16")]
19468    fn test_mm512_mask_fcmul_pch() {
19469        let a = _mm512_set1_pch(0.0, 1.0);
19470        let b = _mm512_set1_pch(0.0, -1.0);
19471        let src = _mm512_setr_ph(
19472            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19473            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19474            32.0, 33.0,
19475        );
19476        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
19477        let e = _mm512_setr_ph(
19478            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19479            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19480            33.0,
19481        );
19482        assert_eq_m512h(r, e);
19483    }
19484
19485    #[simd_test(enable = "avx512fp16")]
19486    fn test_mm512_maskz_fcmul_pch() {
19487        let a = _mm512_set1_pch(0.0, 1.0);
19488        let b = _mm512_set1_pch(0.0, -1.0);
19489        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
19490        let e = _mm512_setr_ph(
19491            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19492            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19493        );
19494        assert_eq_m512h(r, e);
19495    }
19496
19497    #[simd_test(enable = "avx512fp16")]
19498    fn test_mm512_fcmul_round_pch() {
19499        let a = _mm512_set1_pch(0.0, 1.0);
19500        let b = _mm512_set1_pch(0.0, -1.0);
19501        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19502        let e = _mm512_set1_pch(-1.0, 0.0);
19503        assert_eq_m512h(r, e);
19504    }
19505
19506    #[simd_test(enable = "avx512fp16")]
19507    fn test_mm512_mask_fcmul_round_pch() {
19508        let a = _mm512_set1_pch(0.0, 1.0);
19509        let b = _mm512_set1_pch(0.0, -1.0);
19510        let src = _mm512_setr_ph(
19511            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19512            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19513            32.0, 33.0,
19514        );
19515        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19516            src,
19517            0b0101010101010101,
19518            a,
19519            b,
19520        );
19521        let e = _mm512_setr_ph(
19522            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19523            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19524            33.0,
19525        );
19526        assert_eq_m512h(r, e);
19527    }
19528
19529    #[simd_test(enable = "avx512fp16")]
19530    fn test_mm512_maskz_fcmul_round_pch() {
19531        let a = _mm512_set1_pch(0.0, 1.0);
19532        let b = _mm512_set1_pch(0.0, -1.0);
19533        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19534            0b0101010101010101,
19535            a,
19536            b,
19537        );
19538        let e = _mm512_setr_ph(
19539            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19540            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19541        );
19542        assert_eq_m512h(r, e);
19543    }
19544
    // Scalar conjugate multiply: only the low complex element is computed,
    // (0+1i) * conj(0-1i) = -1+0i; f16 elements 2..=7 pass through from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_fcmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked scalar: mask 0 means the complex scalar comes from `src`,
    // with the upper elements still copied from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fcmul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Zero-masked scalar: mask 0 zeroes the complex scalar.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_maskz_fcmul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Scalar conjugate multiply through the explicit-rounding entry point.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Explicit rounding + merge-masking for the scalar form.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Explicit rounding + zero-masking for the scalar form.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r =
            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19603
    // abs_ph takes the absolute value of every f16 lane; `const fn` here matches
    // the const-evaluable intrinsic (it is a pure sign-bit operation).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_abs_ph() {
        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
        let r = _mm_abs_ph(a);
        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit lanewise absolute value over 16 f16 elements.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_abs_ph() {
        let a = _mm256_set_ph(
            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
            -14.0,
        );
        let r = _mm256_abs_ph(a);
        let e = _mm256_set_ph(
            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit lanewise absolute value over 32 f16 elements.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_abs_ph() {
        let a = _mm512_set_ph(
            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
            27.0, -28.0, 29.0, -30.0,
        );
        let r = _mm512_abs_ph(a);
        let e = _mm512_set_ph(
            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
            29.0, 30.0,
        );
        assert_eq_m512h(r, e);
    }
19640
    // conj_pch negates the imaginary half of every complex lane: 0+1i -> 0-1i.
    // `const fn` matches the const-evaluable intrinsic (a sign-bit flip).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let r = _mm_conj_pch(a);
        let e = _mm_set1_pch(0.0, -1.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked conjugate: mask 0b0101 conjugates complex lanes 0 and 2,
    // lanes 1 and 3 are taken from `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_conj_pch(src, 0b0101, a);
        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    // Zero-masked conjugate: unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_conj_pch(0b0101, a);
        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit conjugate across 8 complex lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_conj_pch(a);
        let e = _mm256_set1_pch(0.0, -1.0);
        assert_eq_m256h(r, e);
    }

    // 256-bit merge-masked conjugate: even complex lanes conjugated, odd from `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
        let e = _mm256_setr_ph(
            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    // 256-bit zero-masked conjugate: odd complex lanes zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_conj_pch(0b01010101, a);
        let e = _mm256_setr_ph(
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit conjugate across 16 complex lanes.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_conj_pch(a);
        let e = _mm512_set1_pch(0.0, -1.0);
        assert_eq_m512h(r, e);
    }

    // 512-bit merge-masked conjugate: even complex lanes conjugated, odd from `src`.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
        let e = _mm512_setr_ph(
            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit zero-masked conjugate: odd complex lanes zeroed.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
        let e = _mm512_setr_ph(
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19732
    // Complex fused multiply-add: each lane computes a*b + c.
    // (0+1i)(0+2i) + (0+3i) = -2 + 3i.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_fmadd_pch(a, b, c);
        let e = _mm_set1_pch(-2.0, 3.0);
        assert_eq_m128h(r, e);
    }

    // mask variant merges unselected complex lanes from the first operand `a`
    // (hence the 0.0, 1.0 pairs in the expected vector).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    // mask3 variant merges unselected complex lanes from the addend `c`
    // (hence the 0.0, 3.0 pairs in the expected vector).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
        assert_eq_m128h(r, e);
    }

    // maskz variant zeroes unselected complex lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit complex FMA over 8 complex lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_fmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_fmadd_pch(a, b, c);
        let e = _mm256_set1_pch(-2.0, 3.0);
        assert_eq_m256h(r, e);
    }

    // 256-bit merge-masked complex FMA (unselected lanes from `a`).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_fmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
        let e = _mm256_setr_ph(
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    // 256-bit mask3 complex FMA (unselected lanes from `c`).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask3_fmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
        let e = _mm256_setr_ph(
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m256h(r, e);
    }

    // 256-bit zero-masked complex FMA.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_fmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
        let e = _mm256_setr_ph(
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit complex FMA over 16 complex lanes.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_fmadd_pch(a, b, c);
        let e = _mm512_set1_pch(-2.0, 3.0);
        assert_eq_m512h(r, e);
    }

    // 512-bit merge-masked complex FMA (unselected lanes from `a`).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit mask3 complex FMA (unselected lanes from `c`).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit zero-masked complex FMA.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    // Explicit-rounding 512-bit complex FMA (round-to-nearest, no exceptions).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r =
            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_pch(-2.0, 3.0);
        assert_eq_m512h(r, e);
    }

    // Explicit rounding + merge-masking (unselected lanes from `a`).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b0101010101010101,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    // Explicit rounding + mask3 merging (unselected lanes from `c`).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b0101010101010101,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }

    // Explicit rounding + zero-masking.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19932
    // Scalar complex FMA: the low complex element computes a*b + c = -2+3i;
    // f16 elements 2..=7 pass through from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fmadd_sch(a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked scalar FMA: mask 0 keeps the scalar from `a`; mask 1 computes it.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fmadd_sch(a, 0, b, c);
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_sch(a, 1, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // mask3 scalar FMA: the upper elements come from `c`, and with mask 0 the
    // scalar does too.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }

    // Zero-masked scalar FMA: mask 0 zeroes the scalar, mask 1 computes it;
    // upper elements always come from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fmadd_sch(0, a, b, c);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_sch(1, a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Scalar FMA through the explicit-rounding entry point.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Explicit rounding + merge-masking for the scalar form.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Explicit rounding + mask3 merging for the scalar form.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }

    // Explicit rounding + zero-masking for the scalar form.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20042
20043    #[simd_test(enable = "avx512fp16,avx512vl")]
20044    fn test_mm_fcmadd_pch() {
20045        let a = _mm_set1_pch(0.0, 1.0);
20046        let b = _mm_set1_pch(0.0, 2.0);
20047        let c = _mm_set1_pch(0.0, 3.0);
20048        let r = _mm_fcmadd_pch(a, b, c);
20049        let e = _mm_set1_pch(2.0, 3.0);
20050        assert_eq_m128h(r, e);
20051    }
20052
20053    #[simd_test(enable = "avx512fp16,avx512vl")]
20054    fn test_mm_mask_fcmadd_pch() {
20055        let a = _mm_set1_pch(0.0, 1.0);
20056        let b = _mm_set1_pch(0.0, 2.0);
20057        let c = _mm_set1_pch(0.0, 3.0);
20058        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
20059        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
20060        assert_eq_m128h(r, e);
20061    }
20062
20063    #[simd_test(enable = "avx512fp16,avx512vl")]
20064    fn test_mm_mask3_fcmadd_pch() {
20065        let a = _mm_set1_pch(0.0, 1.0);
20066        let b = _mm_set1_pch(0.0, 2.0);
20067        let c = _mm_set1_pch(0.0, 3.0);
20068        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
20069        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
20070        assert_eq_m128h(r, e);
20071    }
20072
20073    #[simd_test(enable = "avx512fp16,avx512vl")]
20074    fn test_mm_maskz_fcmadd_pch() {
20075        let a = _mm_set1_pch(0.0, 1.0);
20076        let b = _mm_set1_pch(0.0, 2.0);
20077        let c = _mm_set1_pch(0.0, 3.0);
20078        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
20079        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
20080        assert_eq_m128h(r, e);
20081    }
20082
20083    #[simd_test(enable = "avx512fp16,avx512vl")]
20084    fn test_mm256_fcmadd_pch() {
20085        let a = _mm256_set1_pch(0.0, 1.0);
20086        let b = _mm256_set1_pch(0.0, 2.0);
20087        let c = _mm256_set1_pch(0.0, 3.0);
20088        let r = _mm256_fcmadd_pch(a, b, c);
20089        let e = _mm256_set1_pch(2.0, 3.0);
20090        assert_eq_m256h(r, e);
20091    }
20092
20093    #[simd_test(enable = "avx512fp16,avx512vl")]
20094    fn test_mm256_mask_fcmadd_pch() {
20095        let a = _mm256_set1_pch(0.0, 1.0);
20096        let b = _mm256_set1_pch(0.0, 2.0);
20097        let c = _mm256_set1_pch(0.0, 3.0);
20098        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
20099        let e = _mm256_setr_ph(
20100            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
20101        );
20102        assert_eq_m256h(r, e);
20103    }
20104
20105    #[simd_test(enable = "avx512fp16,avx512vl")]
20106    fn test_mm256_mask3_fcmadd_pch() {
20107        let a = _mm256_set1_pch(0.0, 1.0);
20108        let b = _mm256_set1_pch(0.0, 2.0);
20109        let c = _mm256_set1_pch(0.0, 3.0);
20110        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
20111        let e = _mm256_setr_ph(
20112            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
20113        );
20114        assert_eq_m256h(r, e);
20115    }
20116
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_fcmadd_pch() {
        // maskz variant: clear mask bits zero the whole complex lane (both halves 0.0).
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
20128
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fcmadd_pch() {
        // 512-bit complex conjugate multiply-add: all 16 complex lanes compute 2+3i
        // from a = 0+1i, b = 0+2i, c = 0+3i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_fcmadd_pch(a, b, c);
        let e = _mm512_set1_pch(2.0, 3.0);
        assert_eq_m512h(r, e);
    }
20138
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fcmadd_pch() {
        // 16-bit mask, one bit per complex lane: set bits give 2+3i, clear bits keep `a` (0+1i).
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
20151
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fcmadd_pch() {
        // mask3 variant: clear bits keep `c` (0+3i), hence the (0.0, 3.0) pairs below.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }
20164
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fcmadd_pch() {
        // maskz variant: clear bits zero the whole complex lane.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
20177
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fcmadd_round_pch() {
        // Same computation as test_mm512_fcmadd_pch via the explicit-rounding variant,
        // using round-to-nearest-even with exceptions suppressed.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r =
            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_pch(2.0, 3.0);
        assert_eq_m512h(r, e);
    }
20188
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fcmadd_round_pch() {
        // Explicit-rounding masked variant: clear mask bits keep `a` (0+1i).
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b0101010101010101,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
20206
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fcmadd_round_pch() {
        // Explicit-rounding mask3 variant: clear mask bits keep `c` (0+3i).
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b0101010101010101,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }
20224
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fcmadd_round_pch() {
        // Explicit-rounding maskz variant: clear mask bits zero the complex lane.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
20242
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fcmadd_sch() {
        // Scalar complex conjugate multiply-add: only the low complex lane (elements 0..1)
        // is computed (2+3i); elements 2..7 are copied from `a` unchanged.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fcmadd_sch(a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20252
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fcmadd_sch() {
        // Masked scalar variant: with mask 0 the low complex lane keeps `a`'s value,
        // with mask 1 it is computed; upper elements always come from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20265
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fcmadd_sch() {
        // mask3 scalar variant: upper elements come from `c`; with mask 0 the low
        // complex lane also keeps `c`'s value, with mask 1 it is computed.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }
20278
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fcmadd_sch() {
        // maskz scalar variant: with mask 0 the low complex lane is zeroed (both halves),
        // with mask 1 it is computed; upper elements always come from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20291
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fcmadd_round_sch() {
        // Explicit-rounding scalar variant; same expectation as test_mm_fcmadd_sch.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20301
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fcmadd_round_sch() {
        // Explicit-rounding masked scalar variant; same expectations as test_mm_mask_fcmadd_sch.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20318
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fcmadd_round_sch() {
        // Explicit-rounding mask3 scalar variant; same expectations as test_mm_mask3_fcmadd_sch.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }
20335
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fcmadd_round_sch() {
        // Explicit-rounding maskz scalar variant; same expectations as test_mm_maskz_fcmadd_sch.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20352
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fmadd_ph() {
        // Element-wise fused multiply-add: every f16 lane is 1*2 + 3 = 5.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fmadd_ph(a, b, c);
        let e = _mm_set1_ph(5.0);
        assert_eq_m128h(r, e);
    }
20362
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmadd_ph() {
        // One mask bit per f16 lane: set bits give 5.0, clear bits keep `a` (1.0).
        // Note _mm_set_ph lists elements high-to-low, so bit 0 maps to the last argument.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
        assert_eq_m128h(r, e);
    }
20372
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmadd_ph() {
        // mask3 variant: clear mask bits keep `c` (3.0) instead of `a`.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
        assert_eq_m128h(r, e);
    }
20382
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmadd_ph() {
        // maskz variant: clear mask bits zero the lane.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
        assert_eq_m128h(r, e);
    }
20392
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_fmadd_ph() {
        // 256-bit element-wise FMA: all 16 f16 lanes are 1*2 + 3 = 5.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmadd_ph(a, b, c);
        let e = _mm256_set1_ph(5.0);
        assert_eq_m256h(r, e);
    }
20402
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fmadd_ph() {
        // 16-bit mask, one bit per lane: set bits give 5.0, clear bits keep `a` (1.0).
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
20414
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fmadd_ph() {
        // mask3 variant: clear mask bits keep `c` (3.0).
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
20426
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fmadd_ph() {
        // maskz variant: clear mask bits zero the lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
20438
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_fmadd_ph() {
        // 512-bit element-wise FMA: all 32 f16 lanes are 1*2 + 3 = 5.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmadd_ph(a, b, c);
        let e = _mm512_set1_ph(5.0);
        assert_eq_m512h(r, e);
    }
20448
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fmadd_ph() {
        // 32-bit mask, one bit per lane: set bits give 5.0, clear bits keep `a` (1.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20461
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fmadd_ph() {
        // mask3 variant: clear mask bits keep `c` (3.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20474
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fmadd_ph() {
        // maskz variant: clear mask bits zero the lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20487
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmadd_round_ph() {
        // Explicit-rounding FMA (round-to-nearest-even, exceptions suppressed);
        // same expectation as test_mm512_fmadd_ph.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(5.0);
        assert_eq_m512h(r, e);
    }
20497
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmadd_round_ph() {
        // Explicit-rounding masked FMA: clear mask bits keep `a` (1.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20515
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmadd_round_ph() {
        // Explicit-rounding mask3 FMA: clear mask bits keep `c` (3.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20533
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmadd_round_ph() {
        // Explicit-rounding maskz FMA: clear mask bits zero the lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20551
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fmadd_sh() {
        // Scalar FMA: only lane 0 is computed (1*2 + 3 = 5); lanes 1..7 copy from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmadd_sh(a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20561
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmadd_sh() {
        // Masked scalar FMA: mask 0 keeps `a`'s lane 0, mask 1 computes 5.0;
        // lanes 1..7 always copy from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmadd_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_sh(a, 1, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20574
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmadd_sh() {
        // mask3 scalar FMA: lanes 1..7 copy from `c`; mask 0 keeps `c`'s lane 0,
        // mask 1 computes 5.0.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
20587
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmadd_sh() {
        // maskz scalar FMA: mask 0 zeroes lane 0, mask 1 computes 5.0;
        // lanes 1..7 always copy from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmadd_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_sh(1, a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20600
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fmadd_round_sh() {
        // Explicit-rounding scalar FMA; same expectation as test_mm_fmadd_sh.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20610
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fmadd_round_sh() {
        // Explicit-rounding masked scalar FMA; same expectations as test_mm_mask_fmadd_sh.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20627
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fmadd_round_sh() {
        // Explicit-rounding mask3 scalar FMA; same expectations as test_mm_mask3_fmadd_sh.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
20644
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fmadd_round_sh() {
        // Explicit-rounding maskz scalar FMA; same expectations as test_mm_maskz_fmadd_sh.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20661
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fmsub_ph() {
        // Element-wise fused multiply-subtract: every f16 lane is 1*2 - 3 = -1.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fmsub_ph(a, b, c);
        let e = _mm_set1_ph(-1.0);
        assert_eq_m128h(r, e);
    }
20671
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmsub_ph() {
        // One mask bit per lane: set bits give -1.0, clear bits keep `a` (1.0).
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
        assert_eq_m128h(r, e);
    }
20681
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmsub_ph() {
        // mask3 variant: clear mask bits keep `c` (3.0).
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
        assert_eq_m128h(r, e);
    }
20691
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmsub_ph() {
        // maskz variant: clear mask bits zero the lane.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
        assert_eq_m128h(r, e);
    }
20701
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_fmsub_ph() {
        // 256-bit element-wise fused multiply-subtract: all lanes are 1*2 - 3 = -1.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmsub_ph(a, b, c);
        let e = _mm256_set1_ph(-1.0);
        assert_eq_m256h(r, e);
    }
20711
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fmsub_ph() {
        // 16-bit mask, one bit per lane: set bits give -1.0, clear bits keep `a` (1.0).
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
20723
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fmsub_ph() {
        // mask3 variant: clear mask bits keep `c` (3.0).
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
20735
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fmsub_ph() {
        // maskz variant: clear mask bits zero the lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
20747
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_fmsub_ph() {
        // 512-bit element-wise fused multiply-subtract: all 32 lanes are 1*2 - 3 = -1.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmsub_ph(a, b, c);
        let e = _mm512_set1_ph(-1.0);
        assert_eq_m512h(r, e);
    }
20757
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fmsub_ph() {
        // 32-bit mask, one bit per lane: set bits give -1.0, clear bits keep `a` (1.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
20770
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fmsub_ph() {
        // mask3 variant: clear mask bits keep `c` (3.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
20783
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fmsub_ph() {
        // maskz variant: clear mask bits zero the lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
20796
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmsub_round_ph() {
        // Explicit-rounding fused multiply-subtract; same expectation as test_mm512_fmsub_ph.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(-1.0);
        assert_eq_m512h(r, e);
    }
20806
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmsub_round_ph() {
        // Explicit-rounding masked fmsub: clear mask bits keep `a` (1.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
20824
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmsub_round_ph() {
        // Explicit-rounding mask3 fmsub: clear mask bits keep `c` (3.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
20842
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmsub_round_ph() {
        // maskz variant: selected lanes hold a*b - c = -1.0, unselected lanes
        // are zeroed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
20860
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fmsub_sh() {
        // Scalar op: lane 0 gets a*b - c = 1.0*2.0 - 3.0 = -1.0; lanes 1..=7
        // are copied through from `a`. Declared `const` so const-eval of the
        // intrinsic is exercised as well.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmsub_sh(a, b, c);
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20870
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmsub_sh() {
        // Masked scalar op: mask 0 leaves lane 0 as `a` (1.0); mask 1 computes
        // a*b - c = -1.0. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmsub_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmsub_sh(a, 1, b, c);
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20883
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmsub_sh() {
        // mask3 scalar op: mask 0 leaves lane 0 as `c` (3.0); mask 1 computes
        // a*b - c = -1.0. Upper lanes always come from `c`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
20896
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmsub_sh() {
        // maskz scalar op: mask 0 zeroes lane 0; mask 1 computes a*b - c = -1.0.
        // Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmsub_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmsub_sh(1, a, b, c);
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20909
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fmsub_round_sh() {
        // Rounded scalar op: lane 0 gets a*b - c = -1.0 under explicit
        // round-to-nearest / no-exceptions; lanes 1..=7 are copied from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20919
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fmsub_round_sh() {
        // Masked rounded scalar op: mask 0 keeps lane 0 from `a`; mask 1
        // computes a*b - c = -1.0. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20936
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fmsub_round_sh() {
        // mask3 rounded scalar op: mask 0 keeps lane 0 from `c` (3.0); mask 1
        // computes a*b - c = -1.0. Upper lanes always come from `c`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
20953
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fmsub_round_sh() {
        // maskz rounded scalar op: mask 0 zeroes lane 0; mask 1 computes
        // a*b - c = -1.0. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20970
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fnmadd_ph() {
        // Every lane computes -(a*b) + c = -(1.0*2.0) + 3.0 = 1.0.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fnmadd_ph(a, b, c);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }
20980
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fnmadd_ph() {
        // Selected lanes hold -(a*b) + c = 1.0; unselected lanes keep `a`,
        // also 1.0 — so every lane of the expected vector is 1.0.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
        assert_eq_m128h(r, e);
    }
20990
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fnmadd_ph() {
        // mask3: selected lanes hold -(a*b) + c = 1.0; unselected keep `c`
        // (3.0). `_mm_set_ph` lists lane 7 first, hence 3.0/1.0 alternation.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
        assert_eq_m128h(r, e);
    }
21000
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fnmadd_ph() {
        // maskz: selected lanes hold -(a*b) + c = 1.0; unselected are zeroed.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }
21010
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_fnmadd_ph() {
        // 256-bit variant: every lane computes -(a*b) + c = 1.0.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fnmadd_ph(a, b, c);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }
21020
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fnmadd_ph() {
        // Selected lanes hold -(a*b) + c = 1.0; unselected keep `a` (also 1.0),
        // so the expected vector is uniformly 1.0.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
21032
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fnmadd_ph() {
        // mask3: selected lanes hold -(a*b) + c = 1.0; unselected keep `c`
        // (3.0). `_mm256_set_ph` lists lane 15 first.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
21044
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fnmadd_ph() {
        // maskz: selected lanes hold -(a*b) + c = 1.0; unselected are zeroed.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
21056
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_fnmadd_ph() {
        // 512-bit variant: every lane computes -(a*b) + c = 1.0.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fnmadd_ph(a, b, c);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
21066
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fnmadd_ph() {
        // Selected lanes hold -(a*b) + c = 1.0; unselected keep `a` (also 1.0),
        // so the expected vector is uniformly 1.0.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
21079
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fnmadd_ph() {
        // mask3: selected lanes hold -(a*b) + c = 1.0; unselected keep `c`
        // (3.0). `_mm512_set_ph` lists lane 31 first.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
21092
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fnmadd_ph() {
        // maskz: selected lanes hold -(a*b) + c = 1.0; unselected are zeroed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
21105
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fnmadd_round_ph() {
        // Every lane computes -(a*b) + c = 1.0 under explicit round-to-nearest
        // with exceptions suppressed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r =
            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
21116
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fnmadd_round_ph() {
        // Selected lanes hold -(a*b) + c = 1.0; unselected keep `a` (also 1.0),
        // so the expected vector is uniformly 1.0.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
21134
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fnmadd_round_ph() {
        // mask3: selected lanes hold -(a*b) + c = 1.0; unselected keep `c`
        // (3.0). `_mm512_set_ph` lists lane 31 first.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
21152
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fnmadd_round_ph() {
        // maskz: selected lanes hold -(a*b) + c = 1.0; unselected are zeroed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
21170
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fnmadd_sh() {
        // Scalar op: lane 0 gets -(a*b) + c = -(1.0*2.0) + 3.0 = 1.0;
        // lanes 1..=7 are copied through from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmadd_sh(a, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21180
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fnmadd_sh() {
        // Masked scalar op: mask 0 keeps lane 0 from `a` (1.0); mask 1 computes
        // -(a*b) + c = 1.0. Both expected vectors coincide since a[0] == 1.0.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21193
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fnmadd_sh() {
        // mask3 scalar op: mask 0 keeps lane 0 from `c` (3.0); mask 1 computes
        // -(a*b) + c = 1.0. Upper lanes always come from `c`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
21206
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fnmadd_sh() {
        // maskz scalar op: mask 0 zeroes lane 0; mask 1 computes
        // -(a*b) + c = 1.0. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21219
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fnmadd_round_sh() {
        // Rounded scalar op: lane 0 gets -(a*b) + c = 1.0 under explicit
        // round-to-nearest / no-exceptions; lanes 1..=7 copied from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21229
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fnmadd_round_sh() {
        // Masked rounded scalar op: mask 0 keeps lane 0 from `a`; mask 1
        // computes -(a*b) + c = 1.0 (same value, since a[0] == 1.0).
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21246
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fnmadd_round_sh() {
        // mask3 rounded scalar op: mask 0 keeps lane 0 from `c` (3.0); mask 1
        // computes -(a*b) + c = 1.0. Upper lanes always come from `c`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
21263
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fnmadd_round_sh() {
        // maskz rounded scalar op: mask 0 zeroes lane 0; mask 1 computes
        // -(a*b) + c = 1.0. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21280
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fnmsub_ph() {
        // Every lane computes -(a*b) - c = -(1.0*2.0) - 3.0 = -5.0.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fnmsub_ph(a, b, c);
        let e = _mm_set1_ph(-5.0);
        assert_eq_m128h(r, e);
    }
21290
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fnmsub_ph() {
        // Selected lanes hold -(a*b) - c = -5.0; unselected keep `a` (1.0).
        // `_mm_set_ph` lists lane 7 first, hence the 1.0/-5.0 alternation.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
        assert_eq_m128h(r, e);
    }
21300
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fnmsub_ph() {
        // mask3: selected lanes hold -(a*b) - c = -5.0; unselected keep `c` (3.0).
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
        assert_eq_m128h(r, e);
    }
21310
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fnmsub_ph() {
        // maskz: selected lanes hold -(a*b) - c = -5.0; unselected are zeroed.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
        assert_eq_m128h(r, e);
    }
21320
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_fnmsub_ph() {
        // 256-bit variant: every lane computes -(a*b) - c = -5.0.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fnmsub_ph(a, b, c);
        let e = _mm256_set1_ph(-5.0);
        assert_eq_m256h(r, e);
    }
21330
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fnmsub_ph() {
        // Selected lanes hold -(a*b) - c = -5.0; unselected keep `a` (1.0).
        // `_mm256_set_ph` lists lane 15 first.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
        );
        assert_eq_m256h(r, e);
    }
21342
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fnmsub_ph() {
        // mask3: selected lanes hold -(a*b) - c = -5.0; unselected keep `c` (3.0).
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
        );
        assert_eq_m256h(r, e);
    }
21354
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fnmsub_ph() {
        // maskz: selected lanes hold -(a*b) - c = -5.0; unselected are zeroed.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
        );
        assert_eq_m256h(r, e);
    }
21366
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_fnmsub_ph() {
        // 512-bit variant: every lane computes -(a*b) - c = -5.0.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fnmsub_ph(a, b, c);
        let e = _mm512_set1_ph(-5.0);
        assert_eq_m512h(r, e);
    }
21376
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fnmsub_ph() {
        // Selected lanes hold -(a*b) - c = -5.0; unselected keep `a` (1.0).
        // `_mm512_set_ph` lists lane 31 first.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }
21389
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fnmsub_ph() {
        // mask3: selected lanes hold -(a*b) - c = -5.0; unselected keep `c` (3.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }
21402
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fnmsub_ph() {
        // maskz: selected lanes hold -(a*b) - c = -5.0; unselected are zeroed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }
21415
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fnmsub_round_ph() {
        // Every lane computes -(a*b) - c = -5.0 under explicit round-to-nearest
        // with exceptions suppressed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r =
            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(-5.0);
        assert_eq_m512h(r, e);
    }
21426
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fnmsub_round_ph() {
        // Selected lanes hold -(a*b) - c = -5.0; unselected keep `a` (1.0).
        // `_mm512_set_ph` lists lane 31 first.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }
21444
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fnmsub_round_ph() {
        // mask3: selected lanes hold -(a*b) - c = -5.0; unselected keep `c` (3.0).
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }
21462
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fnmsub_round_ph() {
        // maskz: selected lanes hold -(a*b) - c = -5.0; unselected are zeroed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
        );
        assert_eq_m512h(r, e);
    }
21480
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fnmsub_sh() {
        // Scalar op: lane 0 gets -(a*b) - c = -(1.0*2.0) - 3.0 = -5.0;
        // lanes 1..=7 are copied through from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmsub_sh(a, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21490
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fnmsub_sh() {
        // Masked scalar op: mask 0 keeps lane 0 from `a` (1.0); mask 1 computes
        // -(a*b) - c = -5.0. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21503
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fnmsub_sh() {
        // mask3 scalar op: mask 0 keeps lane 0 from `c` (3.0); mask 1 computes
        // -(a*b) - c = -5.0. Upper lanes always come from `c`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
21516
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fnmsub_sh() {
        // maskz scalar op: mask 0 zeroes lane 0; mask 1 computes
        // -(a*b) - c = -5.0. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21529
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fnmsub_round_sh() {
        // Rounded scalar op: lane 0 gets -(a*b) - c = -5.0 under explicit
        // round-to-nearest / no-exceptions; lanes 1..=7 copied from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21539
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fnmsub_round_sh() {
        // Masked rounded scalar op: mask 0 keeps lane 0 from `a`; mask 1
        // computes -(a*b) - c = -5.0. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21556
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fnmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // mask3 variant: unselected low lane and all upper lanes come from `c`.
        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane = -(1.0 * 2.0) - 3.0 = -5.0.
        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
21573
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fnmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: low lane is zeroed; upper lanes pass through from `a`.
        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane = -(1.0 * 2.0) - 3.0 = -5.0.
        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21590
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fmaddsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        // fmaddsub: even-index lanes compute a*b - c = -1.0, odd-index lanes a*b + c = 5.0.
        // (`_mm_set_ph` lists arguments from the highest lane down to lane 0.)
        let r = _mm_fmaddsub_ph(a, b, c);
        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }
21600
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmaddsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        // Mask 0b00110011 selects lanes 0,1,4,5 for the fmaddsub result
        // (even lane: a*b - c = -1.0, odd lane: a*b + c = 5.0); other lanes keep `a` (1.0).
        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }
21610
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmaddsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        // mask3 variant: lanes 0,1,4,5 take the fmaddsub result (-1.0 / 5.0);
        // unselected lanes keep `c` (3.0).
        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }
21620
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmaddsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        // maskz variant: lanes 0,1,4,5 take the fmaddsub result (-1.0 / 5.0);
        // unselected lanes are zeroed.
        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }
21630
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_fmaddsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        // 256-bit fmaddsub: even lanes a*b - c = -1.0, odd lanes a*b + c = 5.0.
        let r = _mm256_fmaddsub_ph(a, b, c);
        let e = _mm256_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
21642
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fmaddsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        // Mask 0b0011 pattern: lanes 0,1 of each group of 4 take the fmaddsub
        // result (-1.0 / 5.0); unselected lanes keep `a` (1.0).
        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
        let e = _mm256_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
21654
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fmaddsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        // mask3 variant: selected lanes take the fmaddsub result (-1.0 / 5.0);
        // unselected lanes keep `c` (3.0).
        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
        let e = _mm256_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
21666
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fmaddsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        // maskz variant: selected lanes take the fmaddsub result (-1.0 / 5.0);
        // unselected lanes are zeroed.
        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
        let e = _mm256_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
21678
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_fmaddsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // 512-bit fmaddsub: even lanes a*b - c = -1.0, odd lanes a*b + c = 5.0.
        let r = _mm512_fmaddsub_ph(a, b, c);
        let e = _mm512_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21691
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fmaddsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // Repeating 0b0011 mask: lanes 0,1 of each group of 4 take the fmaddsub
        // result (-1.0 / 5.0); unselected lanes keep `a` (1.0).
        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
        let e = _mm512_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21704
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fmaddsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // mask3 variant: selected lanes take the fmaddsub result (-1.0 / 5.0);
        // unselected lanes keep `c` (3.0).
        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
        let e = _mm512_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21717
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fmaddsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // maskz variant: selected lanes take the fmaddsub result (-1.0 / 5.0);
        // unselected lanes are zeroed.
        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
        let e = _mm512_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21730
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmaddsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // Explicit rounding (round-to-nearest, exceptions suppressed); same values
        // as the non-round test: even lanes -1.0, odd lanes 5.0.
        let r =
            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21744
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmaddsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // Selected lanes (repeating 0b0011 mask) take the rounded fmaddsub result
        // (-1.0 / 5.0); unselected lanes keep `a` (1.0).
        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b00110011001100110011001100110011,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21762
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmaddsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // mask3 variant: selected lanes take the rounded fmaddsub result
        // (-1.0 / 5.0); unselected lanes keep `c` (3.0).
        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b00110011001100110011001100110011,
        );
        let e = _mm512_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21780
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmaddsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // maskz variant: selected lanes take the rounded fmaddsub result
        // (-1.0 / 5.0); unselected lanes are zeroed.
        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b00110011001100110011001100110011,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21798
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fmsubadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        // fmsubadd is the mirror of fmaddsub: even-index lanes compute
        // a*b + c = 5.0, odd-index lanes a*b - c = -1.0.
        let r = _mm_fmsubadd_ph(a, b, c);
        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }
21808
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmsubadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        // Mask 0b00110011 selects lanes 0,1,4,5 for the fmsubadd result
        // (even lane: 5.0, odd lane: -1.0); other lanes keep `a` (1.0).
        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }
21818
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmsubadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        // mask3 variant: lanes 0,1,4,5 take the fmsubadd result (5.0 / -1.0);
        // unselected lanes keep `c` (3.0).
        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }
21828
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmsubadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        // maskz variant: lanes 0,1,4,5 take the fmsubadd result (5.0 / -1.0);
        // unselected lanes are zeroed.
        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }
21838
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_fmsubadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        // 256-bit fmsubadd: even lanes a*b + c = 5.0, odd lanes a*b - c = -1.0.
        let r = _mm256_fmsubadd_ph(a, b, c);
        let e = _mm256_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
21850
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fmsubadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        // Repeating 0b0011 mask: lanes 0,1 of each group of 4 take the fmsubadd
        // result (5.0 / -1.0); unselected lanes keep `a` (1.0).
        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
        let e = _mm256_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
21862
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fmsubadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        // mask3 variant: selected lanes take the fmsubadd result (5.0 / -1.0);
        // unselected lanes keep `c` (3.0).
        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
        let e = _mm256_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
21874
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fmsubadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        // maskz variant: selected lanes take the fmsubadd result (5.0 / -1.0);
        // unselected lanes are zeroed.
        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
        let e = _mm256_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
21886
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_fmsubadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // 512-bit fmsubadd: even lanes a*b + c = 5.0, odd lanes a*b - c = -1.0.
        let r = _mm512_fmsubadd_ph(a, b, c);
        let e = _mm512_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
21899
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fmsubadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // Repeating 0b0011 mask: lanes 0,1 of each group of 4 take the fmsubadd
        // result (5.0 / -1.0); unselected lanes keep `a` (1.0).
        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
        let e = _mm512_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
21912
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fmsubadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // mask3 variant: selected lanes take the fmsubadd result (5.0 / -1.0);
        // unselected lanes keep `c` (3.0).
        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
        let e = _mm512_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
21925
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fmsubadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // maskz variant: selected lanes take the fmsubadd result (5.0 / -1.0);
        // unselected lanes are zeroed.
        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
        let e = _mm512_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
21938
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmsubadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // Explicit rounding (round-to-nearest, exceptions suppressed); same values
        // as the non-round test: even lanes 5.0, odd lanes -1.0.
        let r =
            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
21952
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmsubadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // Selected lanes (repeating 0b0011 mask) take the rounded fmsubadd result
        // (5.0 / -1.0); unselected lanes keep `a` (1.0).
        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b00110011001100110011001100110011,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
21970
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmsubadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // mask3 variant: selected lanes take the rounded fmsubadd result
        // (5.0 / -1.0); unselected lanes keep `c` (3.0).
        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b00110011001100110011001100110011,
        );
        let e = _mm512_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
21988
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmsubadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        // maskz variant: selected lanes take the rounded fmsubadd result
        // (5.0 / -1.0); unselected lanes are zeroed.
        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b00110011001100110011001100110011,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
22006
22007    #[simd_test(enable = "avx512fp16,avx512vl")]
22008    fn test_mm_rcp_ph() {
22009        let a = _mm_set1_ph(2.0);
22010        let r = _mm_rcp_ph(a);
22011        let e = _mm_set1_ph(0.5);
22012        assert_eq_m128h(r, e);
22013    }
22014
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_rcp_ph() {
        let a = _mm_set1_ph(2.0);
        let src = _mm_set1_ph(1.0);
        // Mask 0b01010101 selects even lanes: those get 1/2.0 = 0.5;
        // odd lanes keep `src` (1.0).
        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
        assert_eq_m128h(r, e);
    }
22023
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_rcp_ph() {
        let a = _mm_set1_ph(2.0);
        // Even lanes (mask 0b01010101) get 1/2.0 = 0.5; odd lanes are zeroed.
        let r = _mm_maskz_rcp_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
        assert_eq_m128h(r, e);
    }
22031
22032    #[simd_test(enable = "avx512fp16,avx512vl")]
22033    fn test_mm256_rcp_ph() {
22034        let a = _mm256_set1_ph(2.0);
22035        let r = _mm256_rcp_ph(a);
22036        let e = _mm256_set1_ph(0.5);
22037        assert_eq_m256h(r, e);
22038    }
22039
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_rcp_ph() {
        let a = _mm256_set1_ph(2.0);
        let src = _mm256_set1_ph(1.0);
        // Even lanes (alternating mask) get 1/2.0 = 0.5; odd lanes keep `src` (1.0).
        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
22050
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_rcp_ph() {
        let a = _mm256_set1_ph(2.0);
        // Even lanes (alternating mask) get 1/2.0 = 0.5; odd lanes are zeroed.
        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
22060
22061    #[simd_test(enable = "avx512fp16")]
22062    fn test_mm512_rcp_ph() {
22063        let a = _mm512_set1_ph(2.0);
22064        let r = _mm512_rcp_ph(a);
22065        let e = _mm512_set1_ph(0.5);
22066        assert_eq_m512h(r, e);
22067    }
22068
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_rcp_ph() {
        let a = _mm512_set1_ph(2.0);
        let src = _mm512_set1_ph(1.0);
        // Even lanes (alternating mask) get 1/2.0 = 0.5; odd lanes keep `src` (1.0).
        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22080
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_rcp_ph() {
        let a = _mm512_set1_ph(2.0);
        // Even lanes (alternating mask) get 1/2.0 = 0.5; odd lanes are zeroed.
        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22091
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_rcp_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        // Low lane = reciprocal of b's low lane (1/2.0 = 0.5); upper lanes from `a`.
        let r = _mm_rcp_sh(a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22100
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_rcp_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        // Mask bit 0 clear: low lane keeps `src`'s value (3.0); upper lanes from `a`.
        let r = _mm_mask_rcp_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane = 1/2.0 = 0.5.
        let r = _mm_mask_rcp_sh(src, 1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22113
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_rcp_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        // Mask bit 0 clear: low lane is zeroed; upper lanes from `a`.
        let r = _mm_maskz_rcp_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane = 1/2.0 = 0.5.
        let r = _mm_maskz_rcp_sh(1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22125
22126    #[simd_test(enable = "avx512fp16,avx512vl")]
22127    fn test_mm_rsqrt_ph() {
22128        let a = _mm_set1_ph(4.0);
22129        let r = _mm_rsqrt_ph(a);
22130        let e = _mm_set1_ph(0.5);
22131        assert_eq_m128h(r, e);
22132    }
22133
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_rsqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let src = _mm_set1_ph(1.0);
        // Even lanes (mask 0b01010101) get 1/sqrt(4.0) = 0.5; odd lanes keep `src` (1.0).
        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
        assert_eq_m128h(r, e);
    }
22142
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_rsqrt_ph() {
        let a = _mm_set1_ph(4.0);
        // Even lanes (mask 0b01010101) get 1/sqrt(4.0) = 0.5; odd lanes are zeroed.
        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
        assert_eq_m128h(r, e);
    }
22150
22151    #[simd_test(enable = "avx512fp16,avx512vl")]
22152    fn test_mm256_rsqrt_ph() {
22153        let a = _mm256_set1_ph(4.0);
22154        let r = _mm256_rsqrt_ph(a);
22155        let e = _mm256_set1_ph(0.5);
22156        assert_eq_m256h(r, e);
22157    }
22158
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_rsqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let src = _mm256_set1_ph(1.0);
        // Even lanes (alternating mask) get 1/sqrt(4.0) = 0.5; odd lanes keep `src` (1.0).
        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
22169
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_rsqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        // Even lanes (alternating mask) get 1/sqrt(4.0) = 0.5; odd lanes are zeroed.
        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
22179
22180    #[simd_test(enable = "avx512fp16")]
22181    fn test_mm512_rsqrt_ph() {
22182        let a = _mm512_set1_ph(4.0);
22183        let r = _mm512_rsqrt_ph(a);
22184        let e = _mm512_set1_ph(0.5);
22185        assert_eq_m512h(r, e);
22186    }
22187
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_rsqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let src = _mm512_set1_ph(1.0);
        // Even lanes (alternating mask) get 1/sqrt(4.0) = 0.5; odd lanes keep `src` (1.0).
        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22199
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_rsqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        // Even lanes (alternating mask) get 1/sqrt(4.0) = 0.5; odd lanes are zeroed.
        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22210
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        // Low lane = 1/sqrt(b's low lane) = 1/sqrt(4.0) = 0.5; upper lanes from `a`.
        let r = _mm_rsqrt_sh(a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22219
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        // Mask bit 0 clear: low lane keeps `src`'s value (3.0); upper lanes from `a`.
        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane = 1/sqrt(4.0) = 0.5.
        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22232
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        // Mask bit 0 clear: low lane is zeroed; upper lanes from `a`.
        let r = _mm_maskz_rsqrt_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane = 1/sqrt(4.0) = 0.5.
        let r = _mm_maskz_rsqrt_sh(1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22244
22245    #[simd_test(enable = "avx512fp16,avx512vl")]
22246    fn test_mm_sqrt_ph() {
22247        let a = _mm_set1_ph(4.0);
22248        let r = _mm_sqrt_ph(a);
22249        let e = _mm_set1_ph(2.0);
22250        assert_eq_m128h(r, e);
22251    }
22252
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_sqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let src = _mm_set1_ph(1.0);
        // Even lanes (mask 0b01010101) get sqrt(4.0) = 2.0; odd lanes keep `src` (1.0).
        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
        assert_eq_m128h(r, e);
    }
22261
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_sqrt_ph() {
        let a = _mm_set1_ph(4.0);
        // Even lanes (mask 0b01010101) get sqrt(4.0) = 2.0; odd lanes are zeroed.
        let r = _mm_maskz_sqrt_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
        assert_eq_m128h(r, e);
    }
22269
22270    #[simd_test(enable = "avx512fp16,avx512vl")]
22271    fn test_mm256_sqrt_ph() {
22272        let a = _mm256_set1_ph(4.0);
22273        let r = _mm256_sqrt_ph(a);
22274        let e = _mm256_set1_ph(2.0);
22275        assert_eq_m256h(r, e);
22276    }
22277
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_sqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let src = _mm256_set1_ph(1.0);
        // Even lanes (alternating mask) get sqrt(4.0) = 2.0; odd lanes keep `src` (1.0).
        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
        );
        assert_eq_m256h(r, e);
    }
22288
22289    #[simd_test(enable = "avx512fp16,avx512vl")]
22290    fn test_mm256_maskz_sqrt_ph() {
22291        let a = _mm256_set1_ph(4.0);
22292        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
22293        let e = _mm256_set_ph(
22294            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22295        );
22296        assert_eq_m256h(r, e);
22297    }
22298
22299    #[simd_test(enable = "avx512fp16")]
22300    fn test_mm512_sqrt_ph() {
22301        let a = _mm512_set1_ph(4.0);
22302        let r = _mm512_sqrt_ph(a);
22303        let e = _mm512_set1_ph(2.0);
22304        assert_eq_m512h(r, e);
22305    }
22306
22307    #[simd_test(enable = "avx512fp16")]
22308    fn test_mm512_mask_sqrt_ph() {
22309        let a = _mm512_set1_ph(4.0);
22310        let src = _mm512_set1_ph(1.0);
22311        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
22312        let e = _mm512_set_ph(
22313            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22314            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22315        );
22316        assert_eq_m512h(r, e);
22317    }
22318
22319    #[simd_test(enable = "avx512fp16")]
22320    fn test_mm512_maskz_sqrt_ph() {
22321        let a = _mm512_set1_ph(4.0);
22322        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
22323        let e = _mm512_set_ph(
22324            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22325            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22326        );
22327        assert_eq_m512h(r, e);
22328    }
22329
22330    #[simd_test(enable = "avx512fp16")]
22331    fn test_mm512_sqrt_round_ph() {
22332        let a = _mm512_set1_ph(4.0);
22333        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
22334        let e = _mm512_set1_ph(2.0);
22335        assert_eq_m512h(r, e);
22336    }
22337
22338    #[simd_test(enable = "avx512fp16")]
22339    fn test_mm512_mask_sqrt_round_ph() {
22340        let a = _mm512_set1_ph(4.0);
22341        let src = _mm512_set1_ph(1.0);
22342        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22343            src,
22344            0b01010101010101010101010101010101,
22345            a,
22346        );
22347        let e = _mm512_set_ph(
22348            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22349            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22350        );
22351        assert_eq_m512h(r, e);
22352    }
22353
22354    #[simd_test(enable = "avx512fp16")]
22355    fn test_mm512_maskz_sqrt_round_ph() {
22356        let a = _mm512_set1_ph(4.0);
22357        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22358            0b01010101010101010101010101010101,
22359            a,
22360        );
22361        let e = _mm512_set_ph(
22362            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22363            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22364        );
22365        assert_eq_m512h(r, e);
22366    }
22367
22368    #[simd_test(enable = "avx512fp16,avx512vl")]
22369    fn test_mm_sqrt_sh() {
22370        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22371        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22372        let r = _mm_sqrt_sh(a, b);
22373        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22374        assert_eq_m128h(r, e);
22375    }
22376
22377    #[simd_test(enable = "avx512fp16,avx512vl")]
22378    fn test_mm_mask_sqrt_sh() {
22379        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22380        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22381        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22382        let r = _mm_mask_sqrt_sh(src, 0, a, b);
22383        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22384        assert_eq_m128h(r, e);
22385        let r = _mm_mask_sqrt_sh(src, 1, a, b);
22386        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22387        assert_eq_m128h(r, e);
22388    }
22389
22390    #[simd_test(enable = "avx512fp16,avx512vl")]
22391    fn test_mm_maskz_sqrt_sh() {
22392        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22393        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22394        let r = _mm_maskz_sqrt_sh(0, a, b);
22395        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22396        assert_eq_m128h(r, e);
22397        let r = _mm_maskz_sqrt_sh(1, a, b);
22398        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22399        assert_eq_m128h(r, e);
22400    }
22401
22402    #[simd_test(enable = "avx512fp16,avx512vl")]
22403    fn test_mm_sqrt_round_sh() {
22404        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22405        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22406        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22407        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22408        assert_eq_m128h(r, e);
22409    }
22410
22411    #[simd_test(enable = "avx512fp16,avx512vl")]
22412    fn test_mm_mask_sqrt_round_sh() {
22413        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22414        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22415        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22416        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22417            src, 0, a, b,
22418        );
22419        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22420        assert_eq_m128h(r, e);
22421        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22422            src, 1, a, b,
22423        );
22424        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22425        assert_eq_m128h(r, e);
22426    }
22427
22428    #[simd_test(enable = "avx512fp16,avx512vl")]
22429    fn test_mm_maskz_sqrt_round_sh() {
22430        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22431        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22432        let r =
22433            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22434        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22435        assert_eq_m128h(r, e);
22436        let r =
22437            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22438        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22439        assert_eq_m128h(r, e);
22440    }
22441
22442    #[simd_test(enable = "avx512fp16,avx512vl")]
22443    fn test_mm_max_ph() {
22444        let a = _mm_set1_ph(2.0);
22445        let b = _mm_set1_ph(1.0);
22446        let r = _mm_max_ph(a, b);
22447        let e = _mm_set1_ph(2.0);
22448        assert_eq_m128h(r, e);
22449    }
22450
22451    #[simd_test(enable = "avx512fp16,avx512vl")]
22452    fn test_mm_mask_max_ph() {
22453        let a = _mm_set1_ph(2.0);
22454        let b = _mm_set1_ph(1.0);
22455        let src = _mm_set1_ph(3.0);
22456        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
22457        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
22458        assert_eq_m128h(r, e);
22459    }
22460
22461    #[simd_test(enable = "avx512fp16,avx512vl")]
22462    fn test_mm_maskz_max_ph() {
22463        let a = _mm_set1_ph(2.0);
22464        let b = _mm_set1_ph(1.0);
22465        let r = _mm_maskz_max_ph(0b01010101, a, b);
22466        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22467        assert_eq_m128h(r, e);
22468    }
22469
22470    #[simd_test(enable = "avx512fp16,avx512vl")]
22471    fn test_mm256_max_ph() {
22472        let a = _mm256_set1_ph(2.0);
22473        let b = _mm256_set1_ph(1.0);
22474        let r = _mm256_max_ph(a, b);
22475        let e = _mm256_set1_ph(2.0);
22476        assert_eq_m256h(r, e);
22477    }
22478
22479    #[simd_test(enable = "avx512fp16,avx512vl")]
22480    fn test_mm256_mask_max_ph() {
22481        let a = _mm256_set1_ph(2.0);
22482        let b = _mm256_set1_ph(1.0);
22483        let src = _mm256_set1_ph(3.0);
22484        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
22485        let e = _mm256_set_ph(
22486            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22487        );
22488        assert_eq_m256h(r, e);
22489    }
22490
22491    #[simd_test(enable = "avx512fp16,avx512vl")]
22492    fn test_mm256_maskz_max_ph() {
22493        let a = _mm256_set1_ph(2.0);
22494        let b = _mm256_set1_ph(1.0);
22495        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
22496        let e = _mm256_set_ph(
22497            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22498        );
22499        assert_eq_m256h(r, e);
22500    }
22501
22502    #[simd_test(enable = "avx512fp16")]
22503    fn test_mm512_max_ph() {
22504        let a = _mm512_set1_ph(2.0);
22505        let b = _mm512_set1_ph(1.0);
22506        let r = _mm512_max_ph(a, b);
22507        let e = _mm512_set1_ph(2.0);
22508        assert_eq_m512h(r, e);
22509    }
22510
22511    #[simd_test(enable = "avx512fp16")]
22512    fn test_mm512_mask_max_ph() {
22513        let a = _mm512_set1_ph(2.0);
22514        let b = _mm512_set1_ph(1.0);
22515        let src = _mm512_set1_ph(3.0);
22516        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
22517        let e = _mm512_set_ph(
22518            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22519            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22520        );
22521        assert_eq_m512h(r, e);
22522    }
22523
22524    #[simd_test(enable = "avx512fp16")]
22525    fn test_mm512_maskz_max_ph() {
22526        let a = _mm512_set1_ph(2.0);
22527        let b = _mm512_set1_ph(1.0);
22528        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
22529        let e = _mm512_set_ph(
22530            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22531            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22532        );
22533        assert_eq_m512h(r, e);
22534    }
22535
22536    #[simd_test(enable = "avx512fp16")]
22537    fn test_mm512_max_round_ph() {
22538        let a = _mm512_set1_ph(2.0);
22539        let b = _mm512_set1_ph(1.0);
22540        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22541        let e = _mm512_set1_ph(2.0);
22542        assert_eq_m512h(r, e);
22543    }
22544
22545    #[simd_test(enable = "avx512fp16")]
22546    fn test_mm512_mask_max_round_ph() {
22547        let a = _mm512_set1_ph(2.0);
22548        let b = _mm512_set1_ph(1.0);
22549        let src = _mm512_set1_ph(3.0);
22550        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22551            src,
22552            0b01010101010101010101010101010101,
22553            a,
22554            b,
22555        );
22556        let e = _mm512_set_ph(
22557            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22558            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22559        );
22560        assert_eq_m512h(r, e);
22561    }
22562
22563    #[simd_test(enable = "avx512fp16")]
22564    fn test_mm512_maskz_max_round_ph() {
22565        let a = _mm512_set1_ph(2.0);
22566        let b = _mm512_set1_ph(1.0);
22567        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22568            0b01010101010101010101010101010101,
22569            a,
22570            b,
22571        );
22572        let e = _mm512_set_ph(
22573            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22574            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22575        );
22576        assert_eq_m512h(r, e);
22577    }
22578
22579    #[simd_test(enable = "avx512fp16,avx512vl")]
22580    fn test_mm_max_sh() {
22581        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22582        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22583        let r = _mm_max_sh(a, b);
22584        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22585        assert_eq_m128h(r, e);
22586    }
22587
22588    #[simd_test(enable = "avx512fp16,avx512vl")]
22589    fn test_mm_mask_max_sh() {
22590        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22591        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22592        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22593        let r = _mm_mask_max_sh(src, 0, a, b);
22594        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22595        assert_eq_m128h(r, e);
22596        let r = _mm_mask_max_sh(src, 1, a, b);
22597        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22598        assert_eq_m128h(r, e);
22599    }
22600
22601    #[simd_test(enable = "avx512fp16,avx512vl")]
22602    fn test_mm_maskz_max_sh() {
22603        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22604        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22605        let r = _mm_maskz_max_sh(0, a, b);
22606        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22607        assert_eq_m128h(r, e);
22608        let r = _mm_maskz_max_sh(1, a, b);
22609        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22610        assert_eq_m128h(r, e);
22611    }
22612
22613    #[simd_test(enable = "avx512fp16,avx512vl")]
22614    fn test_mm_max_round_sh() {
22615        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22616        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22617        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22618        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22619        assert_eq_m128h(r, e);
22620    }
22621
22622    #[simd_test(enable = "avx512fp16,avx512vl")]
22623    fn test_mm_mask_max_round_sh() {
22624        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22625        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22626        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22627        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22628            src, 0, a, b,
22629        );
22630        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22631        assert_eq_m128h(r, e);
22632        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22633            src, 1, a, b,
22634        );
22635        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22636        assert_eq_m128h(r, e);
22637    }
22638
22639    #[simd_test(enable = "avx512fp16,avx512vl")]
22640    fn test_mm_maskz_max_round_sh() {
22641        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22642        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22643        let r =
22644            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22645        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22646        assert_eq_m128h(r, e);
22647        let r =
22648            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22649        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22650        assert_eq_m128h(r, e);
22651    }
22652
22653    #[simd_test(enable = "avx512fp16,avx512vl")]
22654    fn test_mm_min_ph() {
22655        let a = _mm_set1_ph(2.0);
22656        let b = _mm_set1_ph(1.0);
22657        let r = _mm_min_ph(a, b);
22658        let e = _mm_set1_ph(1.0);
22659        assert_eq_m128h(r, e);
22660    }
22661
22662    #[simd_test(enable = "avx512fp16,avx512vl")]
22663    fn test_mm_mask_min_ph() {
22664        let a = _mm_set1_ph(2.0);
22665        let b = _mm_set1_ph(1.0);
22666        let src = _mm_set1_ph(3.0);
22667        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
22668        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
22669        assert_eq_m128h(r, e);
22670    }
22671
22672    #[simd_test(enable = "avx512fp16,avx512vl")]
22673    fn test_mm_maskz_min_ph() {
22674        let a = _mm_set1_ph(2.0);
22675        let b = _mm_set1_ph(1.0);
22676        let r = _mm_maskz_min_ph(0b01010101, a, b);
22677        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22678        assert_eq_m128h(r, e);
22679    }
22680
22681    #[simd_test(enable = "avx512fp16,avx512vl")]
22682    fn test_mm256_min_ph() {
22683        let a = _mm256_set1_ph(2.0);
22684        let b = _mm256_set1_ph(1.0);
22685        let r = _mm256_min_ph(a, b);
22686        let e = _mm256_set1_ph(1.0);
22687        assert_eq_m256h(r, e);
22688    }
22689
22690    #[simd_test(enable = "avx512fp16,avx512vl")]
22691    fn test_mm256_mask_min_ph() {
22692        let a = _mm256_set1_ph(2.0);
22693        let b = _mm256_set1_ph(1.0);
22694        let src = _mm256_set1_ph(3.0);
22695        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
22696        let e = _mm256_set_ph(
22697            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22698        );
22699        assert_eq_m256h(r, e);
22700    }
22701
22702    #[simd_test(enable = "avx512fp16,avx512vl")]
22703    fn test_mm256_maskz_min_ph() {
22704        let a = _mm256_set1_ph(2.0);
22705        let b = _mm256_set1_ph(1.0);
22706        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
22707        let e = _mm256_set_ph(
22708            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22709        );
22710        assert_eq_m256h(r, e);
22711    }
22712
22713    #[simd_test(enable = "avx512fp16")]
22714    fn test_mm512_min_ph() {
22715        let a = _mm512_set1_ph(2.0);
22716        let b = _mm512_set1_ph(1.0);
22717        let r = _mm512_min_ph(a, b);
22718        let e = _mm512_set1_ph(1.0);
22719        assert_eq_m512h(r, e);
22720    }
22721
22722    #[simd_test(enable = "avx512fp16")]
22723    fn test_mm512_mask_min_ph() {
22724        let a = _mm512_set1_ph(2.0);
22725        let b = _mm512_set1_ph(1.0);
22726        let src = _mm512_set1_ph(3.0);
22727        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
22728        let e = _mm512_set_ph(
22729            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22730            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22731        );
22732        assert_eq_m512h(r, e);
22733    }
22734
22735    #[simd_test(enable = "avx512fp16")]
22736    fn test_mm512_maskz_min_ph() {
22737        let a = _mm512_set1_ph(2.0);
22738        let b = _mm512_set1_ph(1.0);
22739        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
22740        let e = _mm512_set_ph(
22741            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22742            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22743        );
22744        assert_eq_m512h(r, e);
22745    }
22746
22747    #[simd_test(enable = "avx512fp16")]
22748    fn test_mm512_min_round_ph() {
22749        let a = _mm512_set1_ph(2.0);
22750        let b = _mm512_set1_ph(1.0);
22751        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22752        let e = _mm512_set1_ph(1.0);
22753        assert_eq_m512h(r, e);
22754    }
22755
22756    #[simd_test(enable = "avx512fp16")]
22757    fn test_mm512_mask_min_round_ph() {
22758        let a = _mm512_set1_ph(2.0);
22759        let b = _mm512_set1_ph(1.0);
22760        let src = _mm512_set1_ph(3.0);
22761        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22762            src,
22763            0b01010101010101010101010101010101,
22764            a,
22765            b,
22766        );
22767        let e = _mm512_set_ph(
22768            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22769            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22770        );
22771        assert_eq_m512h(r, e);
22772    }
22773
22774    #[simd_test(enable = "avx512fp16")]
22775    fn test_mm512_maskz_min_round_ph() {
22776        let a = _mm512_set1_ph(2.0);
22777        let b = _mm512_set1_ph(1.0);
22778        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22779            0b01010101010101010101010101010101,
22780            a,
22781            b,
22782        );
22783        let e = _mm512_set_ph(
22784            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22785            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22786        );
22787        assert_eq_m512h(r, e);
22788    }
22789
22790    #[simd_test(enable = "avx512fp16,avx512vl")]
22791    fn test_mm_min_sh() {
22792        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22793        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22794        let r = _mm_min_sh(a, b);
22795        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22796        assert_eq_m128h(r, e);
22797    }
22798
22799    #[simd_test(enable = "avx512fp16,avx512vl")]
22800    fn test_mm_mask_min_sh() {
22801        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22802        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22803        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22804        let r = _mm_mask_min_sh(src, 0, a, b);
22805        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22806        assert_eq_m128h(r, e);
22807        let r = _mm_mask_min_sh(src, 1, a, b);
22808        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22809        assert_eq_m128h(r, e);
22810    }
22811
22812    #[simd_test(enable = "avx512fp16,avx512vl")]
22813    fn test_mm_maskz_min_sh() {
22814        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22815        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22816        let r = _mm_maskz_min_sh(0, a, b);
22817        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22818        assert_eq_m128h(r, e);
22819        let r = _mm_maskz_min_sh(1, a, b);
22820        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22821        assert_eq_m128h(r, e);
22822    }
22823
22824    #[simd_test(enable = "avx512fp16,avx512vl")]
22825    fn test_mm_min_round_sh() {
22826        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22827        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22828        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22829        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22830        assert_eq_m128h(r, e);
22831    }
22832
22833    #[simd_test(enable = "avx512fp16,avx512vl")]
22834    fn test_mm_mask_min_round_sh() {
22835        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22836        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22837        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22838        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22839            src, 0, a, b,
22840        );
22841        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22842        assert_eq_m128h(r, e);
22843        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22844            src, 1, a, b,
22845        );
22846        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22847        assert_eq_m128h(r, e);
22848    }
22849
22850    #[simd_test(enable = "avx512fp16,avx512vl")]
22851    fn test_mm_maskz_min_round_sh() {
22852        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22853        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22854        let r =
22855            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22856        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22857        assert_eq_m128h(r, e);
22858        let r =
22859            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22860        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22861        assert_eq_m128h(r, e);
22862    }
22863
22864    #[simd_test(enable = "avx512fp16,avx512vl")]
22865    fn test_mm_getexp_ph() {
22866        let a = _mm_set1_ph(3.0);
22867        let r = _mm_getexp_ph(a);
22868        let e = _mm_set1_ph(1.0);
22869        assert_eq_m128h(r, e);
22870    }
22871
22872    #[simd_test(enable = "avx512fp16,avx512vl")]
22873    fn test_mm_mask_getexp_ph() {
22874        let a = _mm_set1_ph(3.0);
22875        let src = _mm_set1_ph(4.0);
22876        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
22877        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
22878        assert_eq_m128h(r, e);
22879    }
22880
22881    #[simd_test(enable = "avx512fp16,avx512vl")]
22882    fn test_mm_maskz_getexp_ph() {
22883        let a = _mm_set1_ph(3.0);
22884        let r = _mm_maskz_getexp_ph(0b01010101, a);
22885        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22886        assert_eq_m128h(r, e);
22887    }
22888
22889    #[simd_test(enable = "avx512fp16,avx512vl")]
22890    fn test_mm256_getexp_ph() {
22891        let a = _mm256_set1_ph(3.0);
22892        let r = _mm256_getexp_ph(a);
22893        let e = _mm256_set1_ph(1.0);
22894        assert_eq_m256h(r, e);
22895    }
22896
22897    #[simd_test(enable = "avx512fp16,avx512vl")]
22898    fn test_mm256_mask_getexp_ph() {
22899        let a = _mm256_set1_ph(3.0);
22900        let src = _mm256_set1_ph(4.0);
22901        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
22902        let e = _mm256_set_ph(
22903            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22904        );
22905        assert_eq_m256h(r, e);
22906    }
22907
22908    #[simd_test(enable = "avx512fp16,avx512vl")]
22909    fn test_mm256_maskz_getexp_ph() {
22910        let a = _mm256_set1_ph(3.0);
22911        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
22912        let e = _mm256_set_ph(
22913            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22914        );
22915        assert_eq_m256h(r, e);
22916    }
22917
22918    #[simd_test(enable = "avx512fp16")]
22919    fn test_mm512_getexp_ph() {
22920        let a = _mm512_set1_ph(3.0);
22921        let r = _mm512_getexp_ph(a);
22922        let e = _mm512_set1_ph(1.0);
22923        assert_eq_m512h(r, e);
22924    }
22925
22926    #[simd_test(enable = "avx512fp16")]
22927    fn test_mm512_mask_getexp_ph() {
22928        let a = _mm512_set1_ph(3.0);
22929        let src = _mm512_set1_ph(4.0);
22930        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
22931        let e = _mm512_set_ph(
22932            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22933            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22934        );
22935        assert_eq_m512h(r, e);
22936    }
22937
22938    #[simd_test(enable = "avx512fp16")]
22939    fn test_mm512_maskz_getexp_ph() {
22940        let a = _mm512_set1_ph(3.0);
22941        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
22942        let e = _mm512_set_ph(
22943            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22944            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22945        );
22946        assert_eq_m512h(r, e);
22947    }
22948
22949    #[simd_test(enable = "avx512fp16")]
22950    fn test_mm512_getexp_round_ph() {
22951        let a = _mm512_set1_ph(3.0);
22952        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
22953        let e = _mm512_set1_ph(1.0);
22954        assert_eq_m512h(r, e);
22955    }
22956
22957    #[simd_test(enable = "avx512fp16")]
22958    fn test_mm512_mask_getexp_round_ph() {
22959        let a = _mm512_set1_ph(3.0);
22960        let src = _mm512_set1_ph(4.0);
22961        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22962            src,
22963            0b01010101010101010101010101010101,
22964            a,
22965        );
22966        let e = _mm512_set_ph(
22967            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22968            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22969        );
22970        assert_eq_m512h(r, e);
22971    }
22972
22973    #[simd_test(enable = "avx512fp16")]
22974    fn test_mm512_maskz_getexp_round_ph() {
22975        let a = _mm512_set1_ph(3.0);
22976        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22977            0b01010101010101010101010101010101,
22978            a,
22979        );
22980        let e = _mm512_set_ph(
22981            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22982            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22983        );
22984        assert_eq_m512h(r, e);
22985    }
22986
    // Scalar getexp tests: the result's lane 0 is getexp of b[0] (getexp(3.0) = 1.0,
    // since 3.0 = 1.5 * 2^1); lanes 1..7 are copied from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getexp_sh(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 falls back to src[0] = 4.0.
        let r = _mm_mask_getexp_sh(src, 0, a, b);
        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is getexp(3.0) = 1.0.
        let r = _mm_mask_getexp_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        // Zero-masking: mask bit 0 clear zeroes lane 0.
        let r = _mm_maskz_getexp_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getexp_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // The *_round_sh variants repeat the tests above with suppress-all-exceptions.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23054
    // getmant with _MM_MANT_NORM_P75_1P5 normalizes the mantissa into [0.75, 1.5):
    // 10.0 = 1.25 * 2^3, so getmant(10.0) = 1.25.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
        let e = _mm_set1_ph(1.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        let src = _mm_set1_ph(20.0);
        // Alternating mask: even lanes computed (1.25), odd lanes keep src (20.0).
        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        // Zero-masking: unselected lanes become 0.0.
        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
        let e = _mm256_set1_ph(1.25);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let src = _mm256_set1_ph(20.0);
        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m256h(r, e);
    }
23116
    // 512-bit getmant tests; same semantics as the 128/256-bit versions
    // (10.0 = 1.25 * 2^3 normalized into [0.75, 1.5) -> 1.25).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
        let e = _mm512_set1_ph(1.25);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let src = _mm512_set1_ph(20.0);
        // Alternating 32-bit mask: even lanes computed, odd lanes keep src.
        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }

    // The *_round_ph variants add _MM_FROUND_NO_EXC (suppress all exceptions).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let r =
            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
                a,
            );
        let e = _mm512_set1_ph(1.25);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let src = _mm512_set1_ph(20.0);
        let r = _mm512_mask_getmant_round_ph::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_maskz_getmant_round_ph::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23198
    // Scalar getmant tests: lane 0 is getmant of b[0] (getmant(10.0) = 1.25,
    // since 10.0 = 1.25 * 2^3 in [0.75, 1.5)); lanes 1..7 come from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 falls back to src[0] = 20.0.
        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the computed mantissa 1.25.
        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        // Zero-masking: mask bit 0 clear zeroes lane 0.
        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // The *_round_sh variants repeat the tests above with suppress-all-exceptions.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
            a, b,
        );
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 0, a, b);
        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23284
    // roundscale with IMM8 = 0: round to integral using the current (nearest)
    // rounding mode, no scaling -> 1.1 rounds to 1.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        let r = _mm_roundscale_ph::<0>(a);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        let src = _mm_set1_ph(2.0);
        // Alternating mask: even lanes rounded (1.0), odd lanes keep src (2.0).
        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        // Zero-masking: unselected lanes become 0.0.
        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let r = _mm256_roundscale_ph::<0>(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let src = _mm256_set1_ph(2.0);
        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
23338
    // 512-bit roundscale tests; IMM8 = 0 rounds 1.1 to 1.0 (see 128-bit tests).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_roundscale_ph::<0>(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let src = _mm512_set1_ph(2.0);
        // Alternating 32-bit mask: even lanes rounded, odd lanes keep src.
        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    // The *_round_ph variants add _MM_FROUND_NO_EXC (suppress all exceptions).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23407
    // Scalar roundscale tests: lane 0 is b[0] = 1.1 rounded to 1.0 (IMM8 = 0);
    // lanes 1..7 come from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_roundscale_sh::<0>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 falls back to src[0] = 3.0.
        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the rounded value 1.0.
        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        // Zero-masking: mask bit 0 clear zeroes lane 0.
        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // The *_round_sh variants repeat the tests above with suppress-all-exceptions.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23475
    // scalef computes a * 2^floor(b): 1.0 * 2^3 = 8.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        let r = _mm_scalef_ph(a, b);
        let e = _mm_set1_ph(8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        let src = _mm_set1_ph(2.);
        // Alternating mask: even lanes computed (8.0), odd lanes keep src (2.0).
        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        // Zero-masking: unselected lanes become 0.0.
        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let r = _mm256_scalef_ph(a, b);
        let e = _mm256_set1_ph(8.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let src = _mm256_set1_ph(2.);
        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m256h(r, e);
    }
23535
    // 512-bit scalef tests: a * 2^floor(b) = 1.0 * 2^3 = 8.0.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let r = _mm512_scalef_ph(a, b);
        let e = _mm512_set1_ph(8.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let src = _mm512_set1_ph(2.);
        // Alternating 32-bit mask: even lanes computed, odd lanes keep src.
        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }

    // The *_round_ph variants use explicit round-to-nearest + suppress exceptions.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_scalef_round_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(8.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_scalef_round_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let src = _mm512_set1_ph(2.);
        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_scalef_round_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }
23612
    // Scalar scalef tests: lane 0 is a[0] * 2^floor(b[0]) = 1.0 * 2^3 = 8.0;
    // lanes 1..7 come from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_scalef_sh(a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 falls back to src[0] = 2.0.
        let r = _mm_mask_scalef_sh(src, 0, a, b);
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the scaled value 8.0.
        let r = _mm_mask_scalef_sh(src, 1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        // Zero-masking: mask bit 0 clear zeroes lane 0.
        let r = _mm_maskz_scalef_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_scalef_sh(1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // The *_round_sh variants use explicit round-to-nearest + suppress exceptions.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r =
            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23686
23687    #[simd_test(enable = "avx512fp16,avx512vl")]
23688    fn test_mm_reduce_ph() {
23689        let a = _mm_set1_ph(1.25);
23690        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23691        let e = _mm_set1_ph(0.25);
23692        assert_eq_m128h(r, e);
23693    }
23694
23695    #[simd_test(enable = "avx512fp16,avx512vl")]
23696    fn test_mm_mask_reduce_ph() {
23697        let a = _mm_set1_ph(1.25);
23698        let src = _mm_set1_ph(2.0);
23699        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
23700        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
23701        assert_eq_m128h(r, e);
23702    }
23703
23704    #[simd_test(enable = "avx512fp16,avx512vl")]
23705    fn test_mm_maskz_reduce_ph() {
23706        let a = _mm_set1_ph(1.25);
23707        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
23708        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
23709        assert_eq_m128h(r, e);
23710    }
23711
    // 256-bit reduce: same imm8 as the 128-bit test; reduce(1.25) = 0.25 per lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm256_set1_ph(0.25);
        assert_eq_m256h(r, e);
    }
23719
    // Masked 256-bit reduce: even mask bits select reduce(1.25) = 0.25, odd keep src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let src = _mm256_set1_ph(2.0);
        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m256h(r, e);
    }
23730
    // Zero-masked 256-bit reduce: even lanes 0.25, odd lanes zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m256h(r, e);
    }
23740
    // 512-bit reduce: reduce(1.25) = 0.25 in each of the 32 lanes.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm512_set1_ph(0.25);
        assert_eq_m512h(r, e);
    }
23748
    // Masked 512-bit reduce: even mask bits select 0.25, odd lanes keep src (2.0).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }
23764
    // Zero-masked 512-bit reduce: even lanes 0.25, odd lanes zeroed.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }
23778
    // Same as test_mm512_reduce_ph but through the explicit-SAE variant.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
        let e = _mm512_set1_ph(0.25);
        assert_eq_m512h(r, e);
    }
23786
    // Masked explicit-SAE reduce: even lanes take 0.25, odd lanes keep src (2.0).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }
23802
    // Zero-masked explicit-SAE reduce: even lanes 0.25, odd lanes zeroed.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }
23816
    // Scalar reduce: lane 0 = reduce(b[0]) = reduce(1.25) = 0.25; lanes 1..=7 from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23825
    // Masked scalar reduce: mask 0 keeps src[0] (2.0); mask 1 computes reduce(1.25).
    // Upper lanes always come from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23838
    // Zero-masked scalar reduce: mask 0 zeroes lane 0; mask 1 computes reduce(1.25).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23850
    // Explicit-SAE scalar reduce: lane 0 = reduce(1.25) = 0.25, upper lanes from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23859
    // Masked explicit-SAE scalar reduce: mask 0 keeps src[0], mask 1 yields 0.25.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23876
    // Zero-masked explicit-SAE scalar reduce: mask 0 zeroes lane 0, mask 1 yields 0.25.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r =
            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23890
23891    #[simd_test(enable = "avx512fp16,avx512vl")]
23892    const fn test_mm_reduce_add_ph() {
23893        let a = _mm_set1_ph(2.0);
23894        let r = _mm_reduce_add_ph(a);
23895        assert_eq!(r, 16.0);
23896    }
23897
23898    #[simd_test(enable = "avx512fp16,avx512vl")]
23899    const fn test_mm256_reduce_add_ph() {
23900        let a = _mm256_set1_ph(2.0);
23901        let r = _mm256_reduce_add_ph(a);
23902        assert_eq!(r, 32.0);
23903    }
23904
23905    #[simd_test(enable = "avx512fp16")]
23906    const fn test_mm512_reduce_add_ph() {
23907        let a = _mm512_set1_ph(2.0);
23908        let r = _mm512_reduce_add_ph(a);
23909        assert_eq!(r, 64.0);
23910    }
23911
23912    #[simd_test(enable = "avx512fp16,avx512vl")]
23913    const fn test_mm_reduce_mul_ph() {
23914        let a = _mm_set1_ph(2.0);
23915        let r = _mm_reduce_mul_ph(a);
23916        assert_eq!(r, 256.0);
23917    }
23918
23919    #[simd_test(enable = "avx512fp16,avx512vl")]
23920    const fn test_mm256_reduce_mul_ph() {
23921        let a = _mm256_set1_ph(1.2);
23922        let r = _mm256_reduce_mul_ph(a);
23923        assert_eq!(r, 18.5);
23924    }
23925
23926    #[simd_test(enable = "avx512fp16")]
23927    const fn test_mm512_reduce_mul_ph() {
23928        let a = _mm512_set1_ph(1.2);
23929        let r = _mm512_reduce_mul_ph(a);
23930        assert_eq!(r, 342.3);
23931    }
23932
23933    #[simd_test(enable = "avx512fp16,avx512vl")]
23934    fn test_mm_reduce_max_ph() {
23935        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23936        let r = _mm_reduce_max_ph(a);
23937        assert_eq!(r, 8.0);
23938    }
23939
23940    #[simd_test(enable = "avx512fp16,avx512vl")]
23941    fn test_mm256_reduce_max_ph() {
23942        let a = _mm256_set_ph(
23943            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23944        );
23945        let r = _mm256_reduce_max_ph(a);
23946        assert_eq!(r, 16.0);
23947    }
23948
23949    #[simd_test(enable = "avx512fp16")]
23950    fn test_mm512_reduce_max_ph() {
23951        let a = _mm512_set_ph(
23952            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23953            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23954            31.0, 32.0,
23955        );
23956        let r = _mm512_reduce_max_ph(a);
23957        assert_eq!(r, 32.0);
23958    }
23959
23960    #[simd_test(enable = "avx512fp16,avx512vl")]
23961    fn test_mm_reduce_min_ph() {
23962        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23963        let r = _mm_reduce_min_ph(a);
23964        assert_eq!(r, 1.0);
23965    }
23966
23967    #[simd_test(enable = "avx512fp16,avx512vl")]
23968    fn test_mm256_reduce_min_ph() {
23969        let a = _mm256_set_ph(
23970            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23971        );
23972        let r = _mm256_reduce_min_ph(a);
23973        assert_eq!(r, 1.0);
23974    }
23975
23976    #[simd_test(enable = "avx512fp16")]
23977    fn test_mm512_reduce_min_ph() {
23978        let a = _mm512_set_ph(
23979            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23980            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23981            31.0, 32.0,
23982        );
23983        let r = _mm512_reduce_min_ph(a);
23984        assert_eq!(r, 1.0);
23985    }
23986
    // imm8 0x18 = category bits 3 (+inf) | 4 (-inf). With `_mm_set_ph` listing lane 7
    // first, the two infinities sit in lanes 6 and 5, giving mask 0b01100000.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fpclass_ph_mask() {
        let a = _mm_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b01100000);
    }
24002
    // Same infinity classification, pre-masked with 0b01010101: of the infinity
    // lanes (5 and 6), only lane 6 survives the input mask... bit 6 set.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fpclass_ph_mask() {
        let a = _mm_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
        assert_eq!(r, 0b01000000);
    }
24018
    // 256-bit infinity classification: the 8-value pattern repeats twice, so the
    // 128-bit result 0b01100000 repeats per half.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_fpclass_ph_mask() {
        let a = _mm256_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b0110000001100000);
    }
24042
    // Masked 256-bit infinity classification: the alternating input mask keeps only
    // the +inf lane of each 8-lane group (bits 6 and 14).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_fpclass_ph_mask() {
        let a = _mm256_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
        assert_eq!(r, 0b0100000001000000);
    }
24066
    // 512-bit infinity classification: the 8-value pattern repeats four times, so
    // the 8-bit result 0b01100000 repeats per group.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fpclass_ph_mask() {
        let a = _mm512_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b01100000011000000110000001100000);
    }
24106
    // Masked 512-bit infinity classification: the alternating input mask keeps only
    // the +inf lane of each 8-lane group, so 0b01000000 repeats per group.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fpclass_ph_mask() {
        let a = _mm512_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
        assert_eq!(r, 0b01000000010000000100000001000000);
    }
24146
    // Scalar fpclass: +inf matches category 0x18 (infinities), so bit 0 is set.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm_fpclass_sh_mask() {
        let a = _mm_set_sh(f16::INFINITY);
        let r = _mm_fpclass_sh_mask::<0x18>(a);
        assert_eq!(r, 1);
    }
24153
    // Masked scalar fpclass: a clear mask bit forces 0; a set bit reports the match.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm_mask_fpclass_sh_mask() {
        let a = _mm_set_sh(f16::INFINITY);
        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
        assert_eq!(r, 0);
        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
        assert_eq!(r, 1);
    }
24162
    // Blend: a set mask bit selects the lane from `b`, a clear bit from `a`.
    // Mask 0b01010101 therefore negates lanes 0, 2, 4, 6.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_blend_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
        let r = _mm_mask_blend_ph(0b01010101, a, b);
        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
        assert_eq_m128h(r, e);
    }
24171
    // 256-bit blend: even-indexed lanes come from `b` (negated values), odd from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_blend_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
            -14.0, -15.0, -16.0,
        );
        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
            -16.0,
        );
        assert_eq_m256h(r, e);
    }
24188
    // 512-bit blend: even-indexed lanes come from `b` (negated values), odd from `a`.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_blend_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
            29.0, -30.0, 31.0, -32.0,
        );
        assert_eq_m512h(r, e);
    }
24209
    // Two-source permute: idx values 0..=7 select from `a`, 8..=15 from `b`.
    // idx = even positions picks every other element across the a:b concatenation.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_permutex2var_ph() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
        let r = _mm_permutex2var_ph(a, idx, b);
        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
        assert_eq_m128h(r, e);
    }
24219
    // Two-source permute (256-bit): idx 0..=15 selects from `a`, 16..=31 from `b`;
    // even indices pick every other element of the a:b concatenation.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_permutex2var_ph() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_setr_ph(
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
        let r = _mm256_permutex2var_ph(a, idx, b);
        let e = _mm256_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0,
        );
        assert_eq_m256h(r, e);
    }
24237
    // Two-source permute (512-bit): idx 0..=31 selects from `a`, 32..=63 from `b`.
    // Note idx uses `set_epi16` (lane 31 listed first), so lane i holds index 2*i,
    // picking the odd values 1.0, 3.0, ..., 63.0 from the a:b concatenation.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_permutex2var_ph() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_setr_ph(
            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
            61.0, 62.0, 63.0, 64.0,
        );
        let idx = _mm512_set_epi16(
            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
        );
        let r = _mm512_permutex2var_ph(a, idx, b);
        let e = _mm512_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
            59.0, 61.0, 63.0,
        );
        assert_eq_m512h(r, e);
    }
24262
    // Single-source permute: r[i] = a[idx[i]]. Both `a` and `idx` use the `set`
    // (lane-7-first) constructors, so the gathered order reads back via `setr`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_permutexvar_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
        let r = _mm_permutexvar_ph(idx, a);
        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
        assert_eq_m128h(r, e);
    }
24271
    // Single-source permute (256-bit): r[i] = a[idx[i]]; gathers odd values first,
    // then even values.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_permutexvar_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        let r = _mm256_permutexvar_ph(idx, a);
        let e = _mm256_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
24284
    // Single-source permute (512-bit): r[i] = a[idx[i]]; gathers odd values first,
    // then even values.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_permutexvar_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let idx = _mm512_set_epi16(
            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        let r = _mm512_permutexvar_ph(idx, a);
        let e = _mm512_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
            30.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
24304
    // Signed 16-bit int -> f16 conversion, lane-for-lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtepi16_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }
24312
    // Masked conversion: even-indexed lanes get the converted value, odd lanes `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }
24321
    // Zero-masked conversion: even-indexed lanes converted, odd lanes zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m128h(r, e);
    }
24329
    // Signed 16-bit int -> f16 conversion, 16 lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_cvtepi16_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
24339
    // Masked 16-lane conversion: even-indexed lanes converted, odd lanes keep `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }
24352
    // Zero-masked 16-lane conversion: even-indexed lanes converted, odd lanes zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }
24362
    // Signed 16-bit int -> f16 conversion, 32 lanes.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvtepi16_ph(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
24377
    // Masked 32-lane conversion: even-indexed lanes converted, odd lanes keep `src`.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }
24395
    // Zero-masked 32-lane conversion: even-indexed lanes converted, odd lanes zeroed.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
24409
    // Explicit-rounding conversion; small integers are exact in f16, so rounding
    // mode has no visible effect here.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
24424
    // Masked explicit-rounding conversion: even lanes converted, odd lanes keep `src`.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }
24446
    // Zero-masked explicit-rounding conversion: even lanes converted, odd lanes zeroed.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
24463
    // Unsigned 16-bit int -> f16 conversion, lane-for-lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtepu16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtepu16_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }
24471
    // Masked unsigned conversion: even-indexed lanes converted, odd lanes keep `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtepu16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }
24480
24481    #[simd_test(enable = "avx512fp16,avx512vl")]
24482    fn test_mm_maskz_cvtepu16_ph() {
24483        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24484        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
24485        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24486        assert_eq_m128h(r, e);
24487    }
24488
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtepu16_ph() {
        // Full-width conversion: 16 unsigned 16-bit integers to 16 half floats.
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_cvtepu16_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtepu16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
        // Expected result alternates src values (clear mask bit) with converted values (set bit).
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtepu16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
        // Clear mask bits zero the lane instead of taking a src value.
        let e = _mm256_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtepu16_ph() {
        // 512-bit variant: 32 unsigned 16-bit lanes converted in one shot.
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvtepu16_ph(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
        // Alternating mask: src value where the bit is clear, converted value where set.
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
        // Zero-masking: unselected lanes come back as 0.0.
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvt_roundepu16_ph() {
        // Same as test_mm512_cvtepu16_ph but with an explicit rounding mode.
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvt_roundepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        // Merge-masking with rounding: src where the bit is clear, converted where set.
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvt_roundepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        // Zero-masking with rounding: unselected lanes are 0.0.
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
24622
24623    #[simd_test(enable = "avx512fp16,avx512vl")]
24624    fn test_mm_cvtepi32_ph() {
24625        let a = _mm_set_epi32(1, 2, 3, 4);
24626        let r = _mm_cvtepi32_ph(a);
24627        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24628        assert_eq_m128h(r, e);
24629    }
24630
24631    #[simd_test(enable = "avx512fp16,avx512vl")]
24632    fn test_mm_mask_cvtepi32_ph() {
24633        let a = _mm_set_epi32(1, 2, 3, 4);
24634        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24635        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
24636        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24637        assert_eq_m128h(r, e);
24638    }
24639
24640    #[simd_test(enable = "avx512fp16,avx512vl")]
24641    fn test_mm_maskz_cvtepi32_ph() {
24642        let a = _mm_set_epi32(1, 2, 3, 4);
24643        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
24644        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24645        assert_eq_m128h(r, e);
24646    }
24647
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtepi32_ph() {
        // Eight signed 32-bit integers narrow to a full __m128h.
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtepi32_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtepi32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
        // src where the mask bit is clear, converted value where set.
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtepi32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
        // Zero-masking: unselected lanes are 0.0.
        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtepi32_ph() {
        // Sixteen signed 32-bit integers narrow to a full __m256h.
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvtepi32_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvtepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
        // src where the mask bit is clear, converted value where set.
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvtepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
        // Zero-masking: unselected lanes are 0.0.
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvt_roundepi32_ph() {
        // Same conversion with an explicit rounding mode.
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvt_roundepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        // Merge-masking with rounding: src where clear, converted where set.
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvt_roundepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        // Zero-masking with rounding: unselected lanes are 0.0.
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
24745
24746    #[simd_test(enable = "avx512fp16,avx512vl")]
24747    fn test_mm_cvti32_sh() {
24748        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24749        let r = _mm_cvti32_sh(a, 10);
24750        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24751        assert_eq_m128h(r, e);
24752    }
24753
24754    #[simd_test(enable = "avx512fp16,avx512vl")]
24755    fn test_mm_cvt_roundi32_sh() {
24756        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24757        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24758        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24759        assert_eq_m128h(r, e);
24760    }
24761
24762    #[simd_test(enable = "avx512fp16,avx512vl")]
24763    fn test_mm_cvtepu32_ph() {
24764        let a = _mm_set_epi32(1, 2, 3, 4);
24765        let r = _mm_cvtepu32_ph(a);
24766        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24767        assert_eq_m128h(r, e);
24768    }
24769
24770    #[simd_test(enable = "avx512fp16,avx512vl")]
24771    fn test_mm_mask_cvtepu32_ph() {
24772        let a = _mm_set_epi32(1, 2, 3, 4);
24773        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24774        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
24775        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24776        assert_eq_m128h(r, e);
24777    }
24778
24779    #[simd_test(enable = "avx512fp16,avx512vl")]
24780    fn test_mm_maskz_cvtepu32_ph() {
24781        let a = _mm_set_epi32(1, 2, 3, 4);
24782        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
24783        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24784        assert_eq_m128h(r, e);
24785    }
24786
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtepu32_ph() {
        // Eight unsigned 32-bit integers narrow to a full __m128h.
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtepu32_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtepu32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
        // src where the mask bit is clear, converted value where set.
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtepu32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
        // Zero-masking: unselected lanes are 0.0.
        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvtepu32_ph() {
        // Sixteen unsigned 32-bit integers narrow to a full __m256h.
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvtepu32_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvtepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
        // src where the mask bit is clear, converted value where set.
        let e = _mm256_set_ph(
            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvtepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
        // Zero-masking: unselected lanes are 0.0.
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvt_roundepu32_ph() {
        // Same conversion with an explicit rounding mode.
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvt_roundepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        // Merge-masking with rounding: src where clear, converted where set.
        let e = _mm256_set_ph(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvt_roundepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        // Zero-masking with rounding: unselected lanes are 0.0.
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
24885
24886    #[simd_test(enable = "avx512fp16,avx512vl")]
24887    fn test_mm_cvtu32_sh() {
24888        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24889        let r = _mm_cvtu32_sh(a, 10);
24890        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24891        assert_eq_m128h(r, e);
24892    }
24893
24894    #[simd_test(enable = "avx512fp16,avx512vl")]
24895    fn test_mm_cvt_roundu32_sh() {
24896        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24897        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24898        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24899        assert_eq_m128h(r, e);
24900    }
24901
24902    #[simd_test(enable = "avx512fp16,avx512vl")]
24903    fn test_mm_cvtepi64_ph() {
24904        let a = _mm_set_epi64x(1, 2);
24905        let r = _mm_cvtepi64_ph(a);
24906        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24907        assert_eq_m128h(r, e);
24908    }
24909
24910    #[simd_test(enable = "avx512fp16,avx512vl")]
24911    fn test_mm_mask_cvtepi64_ph() {
24912        let a = _mm_set_epi64x(1, 2);
24913        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24914        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
24915        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24916        assert_eq_m128h(r, e);
24917    }
24918
24919    #[simd_test(enable = "avx512fp16,avx512vl")]
24920    fn test_mm_maskz_cvtepi64_ph() {
24921        let a = _mm_set_epi64x(1, 2);
24922        let r = _mm_maskz_cvtepi64_ph(0b01, a);
24923        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
24924        assert_eq_m128h(r, e);
24925    }
24926
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtepi64_ph() {
        // Four signed 64-bit integers fill the low half of a __m128h; high half zeroed.
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_cvtepi64_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtepi64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
        // src where the mask bit is clear, converted where set; upper half zeroed.
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtepi64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
        // Zero-masking: unselected lanes and the unused upper half are 0.0.
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvtepi64_ph() {
        // Eight signed 64-bit integers narrow to a full __m128h.
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvtepi64_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvtepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
        // src where the mask bit is clear, converted value where set.
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvtepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
        // Zero-masking: unselected lanes are 0.0.
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvt_roundepi64_ph() {
        // Same conversion with an explicit rounding mode.
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvt_roundepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        // Merge-masking with rounding: src where clear, converted where set.
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvt_roundepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        // Zero-masking with rounding: unselected lanes are 0.0.
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }
25005
25006    #[simd_test(enable = "avx512fp16,avx512vl")]
25007    fn test_mm_cvtepu64_ph() {
25008        let a = _mm_set_epi64x(1, 2);
25009        let r = _mm_cvtepu64_ph(a);
25010        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25011        assert_eq_m128h(r, e);
25012    }
25013
25014    #[simd_test(enable = "avx512fp16,avx512vl")]
25015    fn test_mm_mask_cvtepu64_ph() {
25016        let a = _mm_set_epi64x(1, 2);
25017        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25018        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
25019        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25020        assert_eq_m128h(r, e);
25021    }
25022
25023    #[simd_test(enable = "avx512fp16,avx512vl")]
25024    fn test_mm_maskz_cvtepu64_ph() {
25025        let a = _mm_set_epi64x(1, 2);
25026        let r = _mm_maskz_cvtepu64_ph(0b01, a);
25027        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25028        assert_eq_m128h(r, e);
25029    }
25030
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtepu64_ph() {
        // Four unsigned 64-bit integers fill the low half of a __m128h; high half zeroed.
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_cvtepu64_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtepu64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
        // src where the mask bit is clear, converted where set; upper half zeroed.
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtepu64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
        // Zero-masking: unselected lanes and the unused upper half are 0.0.
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvtepu64_ph() {
        // Eight unsigned 64-bit integers narrow to a full __m128h.
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvtepu64_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvtepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
        // src where the mask bit is clear, converted value where set.
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvtepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
        // Zero-masking: unselected lanes are 0.0.
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvt_roundepu64_ph() {
        // Same conversion with an explicit rounding mode.
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvt_roundepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        // Merge-masking with rounding: src where clear, converted where set.
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvt_roundepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        // Zero-masking with rounding: unselected lanes are 0.0.
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }
25109
25110    #[simd_test(enable = "avx512fp16,avx512vl")]
25111    fn test_mm_cvtxps_ph() {
25112        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25113        let r = _mm_cvtxps_ph(a);
25114        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25115        assert_eq_m128h(r, e);
25116    }
25117
25118    #[simd_test(enable = "avx512fp16,avx512vl")]
25119    fn test_mm_mask_cvtxps_ph() {
25120        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25121        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25122        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
25123        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
25124        assert_eq_m128h(r, e);
25125    }
25126
25127    #[simd_test(enable = "avx512fp16,avx512vl")]
25128    fn test_mm_maskz_cvtxps_ph() {
25129        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25130        let r = _mm_maskz_cvtxps_ph(0b0101, a);
25131        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25132        assert_eq_m128h(r, e);
25133    }
25134
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtxps_ph() {
        // Eight f32 values narrow to a full __m128h of f16.
        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtxps_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtxps_ph() {
        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
        // src where the mask bit is clear, converted value where set.
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtxps_ph() {
        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
        // Zero-masking: unselected lanes are 0.0.
        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvtxps_ph() {
        // Sixteen f32 values narrow to a full __m256h of f16.
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtxps_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvtxps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
        // src where the mask bit is clear, converted value where set.
        let e = _mm256_set_ph(
            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvtxps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
        // Zero-masking: unselected lanes are 0.0.
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvtx_roundps_ph() {
        // Same narrowing conversion with an explicit rounding mode.
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvtx_roundps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        // Merge-masking with rounding: src where clear, converted where set.
        let e = _mm256_set_ph(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m256h(r, e);
    }
25230
25231    #[simd_test(enable = "avx512fp16,avx512vl")]
25232    fn test_mm512_maskz_cvtx_roundps_ph() {
25233        let a = _mm512_set_ps(
25234            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25235        );
25236        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25237            0b0101010101010101,
25238            a,
25239        );
25240        let e = _mm256_set_ph(
25241            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25242        );
25243        assert_eq_m256h(r, e);
25244    }
25245
    // Tests for the scalar f32 -> f16 conversions (_mm_cvtss_sh and the
    // explicit-rounding _mm_cvt_roundss_sh): only the lowest lane of `b` is
    // converted into the lowest f16 lane of the result; the upper 7 lanes are
    // copied from `a`. The masked forms use only mask bit 0.

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtss_sh(a, b);
        // Lane 0 becomes b[0] converted to f16; lanes 1..7 come from a.
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
        // Mask bit 0 clear: lane 0 is taken from src.
        let r = _mm_mask_cvtss_sh(src, 0, a, b);
        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the converted b[0].
        let r = _mm_mask_cvtss_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        // Mask bit 0 clear: lane 0 is zeroed.
        let r = _mm_maskz_cvtss_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the converted b[0].
        let r = _mm_maskz_cvtss_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvt_roundss_sh() {
        // Explicit rounding-mode variant of the scalar conversion.
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvt_roundss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
        // Mask bit 0 clear: lane 0 merges from src.
        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the converted b[0].
        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvt_roundss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        // Mask bit 0 clear: lane 0 is zeroed.
        let r =
            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the converted b[0].
        let r =
            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }
25319
    // Tests for packed f64 -> f16 conversions (_mm_/_mm256_/_mm512_cvtpd_ph and
    // the explicit-rounding _mm512_cvt_roundpd_ph). The source has fewer lanes
    // than the 128-bit f16 destination, so the unconverted upper lanes of the
    // result are always zeroed (2 lanes used for __m128d, 4 for __m256d, all 8
    // for __m512d).

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtpd_ph() {
        let a = _mm_set_pd(1.0, 2.0);
        let r = _mm_cvtpd_ph(a);
        // Only the low 2 f16 lanes hold converted values; lanes 2..7 are 0.
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtpd_ph() {
        let a = _mm_set_pd(1.0, 2.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        // Mask 0b01: lane 0 converted from a, lane 1 merged from src (16.),
        // remaining lanes zeroed regardless of src.
        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtpd_ph() {
        let a = _mm_set_pd(1.0, 2.0);
        // Zero-masked: cleared mask bits produce 0.0.
        let r = _mm_maskz_cvtpd_ph(0b01, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtpd_ph() {
        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtpd_ph(a);
        // Low 4 f16 lanes hold converted values; lanes 4..7 are 0.
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtpd_ph() {
        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        // Mask 0b0101: even lanes converted, odd lanes merged from src.
        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtpd_ph() {
        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        // Zero-masked: odd lanes zeroed, even lanes converted.
        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvtpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtpd_ph(a);
        // All 8 f16 lanes are filled from the 8 f64 source lanes.
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvtpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        // Alternating mask: even lanes converted, odd lanes merged from src.
        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvtpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        // Alternating zero-mask.
        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_cvt_roundpd_ph() {
        // Explicit rounding-mode variant; values are exact in f16, so the
        // result matches the default-rounding conversion.
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_mask_cvt_roundpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        // Merge-masked explicit-rounding variant.
        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm512_maskz_cvt_roundpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        // Zero-masked explicit-rounding variant.
        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }
25423
    // Tests for the scalar f64 -> f16 conversions (_mm_cvtsd_sh and the
    // explicit-rounding _mm_cvt_roundsd_sh): only the lowest f64 lane of `b` is
    // converted into the lowest f16 lane of the result; the upper 7 lanes are
    // copied from `a`. The masked forms use only mask bit 0.

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let r = _mm_cvtsd_sh(a, b);
        // Lane 0 becomes b[0] converted to f16; lanes 1..7 come from a.
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
        // Mask bit 0 clear: lane 0 is taken from src.
        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the converted b[0].
        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        // Mask bit 0 clear: lane 0 is zeroed.
        let r = _mm_maskz_cvtsd_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the converted b[0].
        let r = _mm_maskz_cvtsd_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvt_roundsd_sh() {
        // Explicit rounding-mode variant of the scalar conversion.
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvt_roundsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
        // Mask bit 0 clear: lane 0 merges from src.
        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the converted b[0].
        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvt_roundsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        // Mask bit 0 clear: lane 0 is zeroed.
        let r =
            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the converted b[0].
        let r =
            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }
25497
25498    #[simd_test(enable = "avx512fp16,avx512vl")]
25499    fn test_mm_cvtph_epi16() {
25500        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25501        let r = _mm_cvttph_epi16(a);
25502        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25503        assert_eq_m128i(r, e);
25504    }
25505
25506    #[simd_test(enable = "avx512fp16,avx512vl")]
25507    fn test_mm_mask_cvtph_epi16() {
25508        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25509        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25510        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25511        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25512        assert_eq_m128i(r, e);
25513    }
25514
25515    #[simd_test(enable = "avx512fp16,avx512vl")]
25516    fn test_mm_maskz_cvtph_epi16() {
25517        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25518        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25519        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25520        assert_eq_m128i(r, e);
25521    }
25522
25523    #[simd_test(enable = "avx512fp16,avx512vl")]
25524    fn test_mm256_cvtph_epi16() {
25525        let a = _mm256_set_ph(
25526            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25527        );
25528        let r = _mm256_cvttph_epi16(a);
25529        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25530        assert_eq_m256i(r, e);
25531    }
25532
25533    #[simd_test(enable = "avx512fp16,avx512vl")]
25534    fn test_mm256_mask_cvtph_epi16() {
25535        let a = _mm256_set_ph(
25536            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25537        );
25538        let src = _mm256_set_epi16(
25539            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25540        );
25541        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25542        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25543        assert_eq_m256i(r, e);
25544    }
25545
25546    #[simd_test(enable = "avx512fp16,avx512vl")]
25547    fn test_mm256_maskz_cvtph_epi16() {
25548        let a = _mm256_set_ph(
25549            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25550        );
25551        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25552        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25553        assert_eq_m256i(r, e);
25554    }
25555
25556    #[simd_test(enable = "avx512fp16")]
25557    fn test_mm512_cvtph_epi16() {
25558        let a = _mm512_set_ph(
25559            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25560            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25561            31.0, 32.0,
25562        );
25563        let r = _mm512_cvttph_epi16(a);
25564        let e = _mm512_set_epi16(
25565            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25566            25, 26, 27, 28, 29, 30, 31, 32,
25567        );
25568        assert_eq_m512i(r, e);
25569    }
25570
25571    #[simd_test(enable = "avx512fp16")]
25572    fn test_mm512_mask_cvtph_epi16() {
25573        let a = _mm512_set_ph(
25574            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25575            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25576            31.0, 32.0,
25577        );
25578        let src = _mm512_set_epi16(
25579            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25580            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25581        );
25582        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
25583        let e = _mm512_set_epi16(
25584            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25585            24, 34, 26, 36, 28, 38, 30, 40, 32,
25586        );
25587        assert_eq_m512i(r, e);
25588    }
25589
25590    #[simd_test(enable = "avx512fp16")]
25591    fn test_mm512_maskz_cvtph_epi16() {
25592        let a = _mm512_set_ph(
25593            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25594            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25595            31.0, 32.0,
25596        );
25597        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
25598        let e = _mm512_set_epi16(
25599            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25600            0, 28, 0, 30, 0, 32,
25601        );
25602        assert_eq_m512i(r, e);
25603    }
25604
25605    #[simd_test(enable = "avx512fp16")]
25606    fn test_mm512_cvt_roundph_epi16() {
25607        let a = _mm512_set_ph(
25608            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25609            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25610            31.0, 32.0,
25611        );
25612        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25613        let e = _mm512_set_epi16(
25614            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25615            25, 26, 27, 28, 29, 30, 31, 32,
25616        );
25617        assert_eq_m512i(r, e);
25618    }
25619
25620    #[simd_test(enable = "avx512fp16")]
25621    fn test_mm512_mask_cvt_roundph_epi16() {
25622        let a = _mm512_set_ph(
25623            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25624            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25625            31.0, 32.0,
25626        );
25627        let src = _mm512_set_epi16(
25628            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25629            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25630        );
25631        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25632            src,
25633            0b01010101010101010101010101010101,
25634            a,
25635        );
25636        let e = _mm512_set_epi16(
25637            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25638            24, 34, 26, 36, 28, 38, 30, 40, 32,
25639        );
25640        assert_eq_m512i(r, e);
25641    }
25642
25643    #[simd_test(enable = "avx512fp16")]
25644    fn test_mm512_maskz_cvt_roundph_epi16() {
25645        let a = _mm512_set_ph(
25646            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25647            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25648            31.0, 32.0,
25649        );
25650        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25651            0b01010101010101010101010101010101,
25652            a,
25653        );
25654        let e = _mm512_set_epi16(
25655            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25656            0, 28, 0, 30, 0, 32,
25657        );
25658        assert_eq_m512i(r, e);
25659    }
25660
25661    #[simd_test(enable = "avx512fp16,avx512vl")]
25662    fn test_mm_cvtph_epu16() {
25663        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25664        let r = _mm_cvttph_epu16(a);
25665        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25666        assert_eq_m128i(r, e);
25667    }
25668
25669    #[simd_test(enable = "avx512fp16,avx512vl")]
25670    fn test_mm_mask_cvtph_epu16() {
25671        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25672        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25673        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
25674        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25675        assert_eq_m128i(r, e);
25676    }
25677
25678    #[simd_test(enable = "avx512fp16,avx512vl")]
25679    fn test_mm_maskz_cvtph_epu16() {
25680        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25681        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
25682        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25683        assert_eq_m128i(r, e);
25684    }
25685
25686    #[simd_test(enable = "avx512fp16,avx512vl")]
25687    fn test_mm256_cvtph_epu16() {
25688        let a = _mm256_set_ph(
25689            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25690        );
25691        let r = _mm256_cvttph_epu16(a);
25692        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25693        assert_eq_m256i(r, e);
25694    }
25695
25696    #[simd_test(enable = "avx512fp16,avx512vl")]
25697    fn test_mm256_mask_cvtph_epu16() {
25698        let a = _mm256_set_ph(
25699            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25700        );
25701        let src = _mm256_set_epi16(
25702            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25703        );
25704        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
25705        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25706        assert_eq_m256i(r, e);
25707    }
25708
25709    #[simd_test(enable = "avx512fp16,avx512vl")]
25710    fn test_mm256_maskz_cvtph_epu16() {
25711        let a = _mm256_set_ph(
25712            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25713        );
25714        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
25715        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25716        assert_eq_m256i(r, e);
25717    }
25718
25719    #[simd_test(enable = "avx512fp16")]
25720    fn test_mm512_cvtph_epu16() {
25721        let a = _mm512_set_ph(
25722            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25723            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25724            31.0, 32.0,
25725        );
25726        let r = _mm512_cvttph_epu16(a);
25727        let e = _mm512_set_epi16(
25728            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25729            25, 26, 27, 28, 29, 30, 31, 32,
25730        );
25731        assert_eq_m512i(r, e);
25732    }
25733
25734    #[simd_test(enable = "avx512fp16")]
25735    fn test_mm512_mask_cvtph_epu16() {
25736        let a = _mm512_set_ph(
25737            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25738            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25739            31.0, 32.0,
25740        );
25741        let src = _mm512_set_epi16(
25742            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25743            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25744        );
25745        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
25746        let e = _mm512_set_epi16(
25747            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25748            24, 34, 26, 36, 28, 38, 30, 40, 32,
25749        );
25750        assert_eq_m512i(r, e);
25751    }
25752
25753    #[simd_test(enable = "avx512fp16")]
25754    fn test_mm512_maskz_cvtph_epu16() {
25755        let a = _mm512_set_ph(
25756            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25757            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25758            31.0, 32.0,
25759        );
25760        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
25761        let e = _mm512_set_epi16(
25762            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25763            0, 28, 0, 30, 0, 32,
25764        );
25765        assert_eq_m512i(r, e);
25766    }
25767
    // Tests for _mm512_cvt_roundph_epu16 (f16 -> u16 with explicit rounding
    // mode), in plain, merge-masked, and zero-masked forms. Inputs are
    // integral, so round-to-nearest gives the same values back.

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        // Alternating mask: set bits take converted lanes, clear bits keep src.
        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        // Zero-masked: cleared mask bits yield 0.
        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }
25823
    // Tests for the truncating f16 -> i16 conversions (_mm_/_mm256_/_mm512_
    // cvttph_epi16), in plain, merge-masked, and zero-masked forms. Inputs are
    // integral, so truncation returns the same values.

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttph_epi16(a);
        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
        // Alternating mask: set bits take converted lanes, clear bits keep src.
        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        // Zero-masked: cleared mask bits yield 0.
        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvttph_epi16(a);
        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvttph_epi16(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }
25930
25931    #[simd_test(enable = "avx512fp16")]
25932    fn test_mm512_cvtt_roundph_epi16() {
25933        let a = _mm512_set_ph(
25934            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25935            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25936            31.0, 32.0,
25937        );
25938        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25939        let e = _mm512_set_epi16(
25940            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25941            25, 26, 27, 28, 29, 30, 31, 32,
25942        );
25943        assert_eq_m512i(r, e);
25944    }
25945
25946    #[simd_test(enable = "avx512fp16")]
25947    fn test_mm512_mask_cvtt_roundph_epi16() {
25948        let a = _mm512_set_ph(
25949            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25950            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25951            31.0, 32.0,
25952        );
25953        let src = _mm512_set_epi16(
25954            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25955            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25956        );
25957        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25958            src,
25959            0b01010101010101010101010101010101,
25960            a,
25961        );
25962        let e = _mm512_set_epi16(
25963            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25964            24, 34, 26, 36, 28, 38, 30, 40, 32,
25965        );
25966        assert_eq_m512i(r, e);
25967    }
25968
25969    #[simd_test(enable = "avx512fp16")]
25970    fn test_mm512_maskz_cvtt_roundph_epi16() {
25971        let a = _mm512_set_ph(
25972            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25973            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25974            31.0, 32.0,
25975        );
25976        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25977            0b01010101010101010101010101010101,
25978            a,
25979        );
25980        let e = _mm512_set_epi16(
25981            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25982            0, 28, 0, 30, 0, 32,
25983        );
25984        assert_eq_m512i(r, e);
25985    }
25986
25987    #[simd_test(enable = "avx512fp16,avx512vl")]
25988    fn test_mm_cvttph_epu16() {
25989        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25990        let r = _mm_cvttph_epu16(a);
25991        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25992        assert_eq_m128i(r, e);
25993    }
25994
25995    #[simd_test(enable = "avx512fp16,avx512vl")]
25996    fn test_mm_mask_cvttph_epu16() {
25997        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25998        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25999        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
26000        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
26001        assert_eq_m128i(r, e);
26002    }
26003
26004    #[simd_test(enable = "avx512fp16,avx512vl")]
26005    fn test_mm_maskz_cvttph_epu16() {
26006        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26007        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
26008        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
26009        assert_eq_m128i(r, e);
26010    }
26011
26012    #[simd_test(enable = "avx512fp16,avx512vl")]
26013    fn test_mm256_cvttph_epu16() {
26014        let a = _mm256_set_ph(
26015            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26016        );
26017        let r = _mm256_cvttph_epu16(a);
26018        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26019        assert_eq_m256i(r, e);
26020    }
26021
26022    #[simd_test(enable = "avx512fp16,avx512vl")]
26023    fn test_mm256_mask_cvttph_epu16() {
26024        let a = _mm256_set_ph(
26025            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26026        );
26027        let src = _mm256_set_epi16(
26028            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26029        );
26030        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
26031        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26032        assert_eq_m256i(r, e);
26033    }
26034
26035    #[simd_test(enable = "avx512fp16,avx512vl")]
26036    fn test_mm256_maskz_cvttph_epu16() {
26037        let a = _mm256_set_ph(
26038            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26039        );
26040        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
26041        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26042        assert_eq_m256i(r, e);
26043    }
26044
26045    #[simd_test(enable = "avx512fp16")]
26046    fn test_mm512_cvttph_epu16() {
26047        let a = _mm512_set_ph(
26048            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26049            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26050            31.0, 32.0,
26051        );
26052        let r = _mm512_cvttph_epu16(a);
26053        let e = _mm512_set_epi16(
26054            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
26055            25, 26, 27, 28, 29, 30, 31, 32,
26056        );
26057        assert_eq_m512i(r, e);
26058    }
26059
26060    #[simd_test(enable = "avx512fp16")]
26061    fn test_mm512_mask_cvttph_epu16() {
26062        let a = _mm512_set_ph(
26063            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26064            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26065            31.0, 32.0,
26066        );
26067        let src = _mm512_set_epi16(
26068            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
26069            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
26070        );
26071        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
26072        let e = _mm512_set_epi16(
26073            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
26074            24, 34, 26, 36, 28, 38, 30, 40, 32,
26075        );
26076        assert_eq_m512i(r, e);
26077    }
26078
26079    #[simd_test(enable = "avx512fp16")]
26080    fn test_mm512_maskz_cvttph_epu16() {
26081        let a = _mm512_set_ph(
26082            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26083            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26084            31.0, 32.0,
26085        );
26086        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
26087        let e = _mm512_set_epi16(
26088            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
26089            0, 28, 0, 30, 0, 32,
26090        );
26091        assert_eq_m512i(r, e);
26092    }
26093
26094    #[simd_test(enable = "avx512fp16")]
26095    fn test_mm512_cvtt_roundph_epu16() {
26096        let a = _mm512_set_ph(
26097            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26098            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26099            31.0, 32.0,
26100        );
26101        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
26102        let e = _mm512_set_epi16(
26103            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
26104            25, 26, 27, 28, 29, 30, 31, 32,
26105        );
26106        assert_eq_m512i(r, e);
26107    }
26108
26109    #[simd_test(enable = "avx512fp16")]
26110    fn test_mm512_mask_cvtt_roundph_epu16() {
26111        let a = _mm512_set_ph(
26112            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26113            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26114            31.0, 32.0,
26115        );
26116        let src = _mm512_set_epi16(
26117            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
26118            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
26119        );
26120        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
26121            src,
26122            0b01010101010101010101010101010101,
26123            a,
26124        );
26125        let e = _mm512_set_epi16(
26126            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
26127            24, 34, 26, 36, 28, 38, 30, 40, 32,
26128        );
26129        assert_eq_m512i(r, e);
26130    }
26131
26132    #[simd_test(enable = "avx512fp16")]
26133    fn test_mm512_maskz_cvtt_roundph_epu16() {
26134        let a = _mm512_set_ph(
26135            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26136            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26137            31.0, 32.0,
26138        );
26139        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
26140            0b01010101010101010101010101010101,
26141            a,
26142        );
26143        let e = _mm512_set_epi16(
26144            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
26145            0, 28, 0, 30, 0, 32,
26146        );
26147        assert_eq_m512i(r, e);
26148    }
26149
26150    #[simd_test(enable = "avx512fp16,avx512vl")]
26151    fn test_mm_cvtph_epi32() {
26152        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26153        let r = _mm_cvtph_epi32(a);
26154        let e = _mm_set_epi32(1, 2, 3, 4);
26155        assert_eq_m128i(r, e);
26156    }
26157
26158    #[simd_test(enable = "avx512fp16,avx512vl")]
26159    fn test_mm_mask_cvtph_epi32() {
26160        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26161        let src = _mm_set_epi32(10, 11, 12, 13);
26162        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
26163        let e = _mm_set_epi32(10, 2, 12, 4);
26164        assert_eq_m128i(r, e);
26165    }
26166
26167    #[simd_test(enable = "avx512fp16,avx512vl")]
26168    fn test_mm_maskz_cvtph_epi32() {
26169        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26170        let r = _mm_maskz_cvtph_epi32(0b0101, a);
26171        let e = _mm_set_epi32(0, 2, 0, 4);
26172        assert_eq_m128i(r, e);
26173    }
26174
26175    #[simd_test(enable = "avx512fp16,avx512vl")]
26176    fn test_mm256_cvtph_epi32() {
26177        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26178        let r = _mm256_cvtph_epi32(a);
26179        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26180        assert_eq_m256i(r, e);
26181    }
26182
26183    #[simd_test(enable = "avx512fp16,avx512vl")]
26184    fn test_mm256_mask_cvtph_epi32() {
26185        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26186        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26187        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
26188        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26189        assert_eq_m256i(r, e);
26190    }
26191
26192    #[simd_test(enable = "avx512fp16,avx512vl")]
26193    fn test_mm256_maskz_cvtph_epi32() {
26194        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26195        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
26196        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26197        assert_eq_m256i(r, e);
26198    }
26199
26200    #[simd_test(enable = "avx512fp16")]
26201    fn test_mm512_cvtph_epi32() {
26202        let a = _mm256_set_ph(
26203            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26204        );
26205        let r = _mm512_cvtph_epi32(a);
26206        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26207        assert_eq_m512i(r, e);
26208    }
26209
26210    #[simd_test(enable = "avx512fp16")]
26211    fn test_mm512_mask_cvtph_epi32() {
26212        let a = _mm256_set_ph(
26213            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26214        );
26215        let src = _mm512_set_epi32(
26216            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26217        );
26218        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
26219        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26220        assert_eq_m512i(r, e);
26221    }
26222
26223    #[simd_test(enable = "avx512fp16")]
26224    fn test_mm512_maskz_cvtph_epi32() {
26225        let a = _mm256_set_ph(
26226            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26227        );
26228        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
26229        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26230        assert_eq_m512i(r, e);
26231    }
26232
26233    #[simd_test(enable = "avx512fp16")]
26234    fn test_mm512_cvt_roundph_epi32() {
26235        let a = _mm256_set_ph(
26236            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26237        );
26238        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26239        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26240        assert_eq_m512i(r, e);
26241    }
26242
26243    #[simd_test(enable = "avx512fp16")]
26244    fn test_mm512_mask_cvt_roundph_epi32() {
26245        let a = _mm256_set_ph(
26246            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26247        );
26248        let src = _mm512_set_epi32(
26249            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26250        );
26251        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26252            src,
26253            0b0101010101010101,
26254            a,
26255        );
26256        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26257        assert_eq_m512i(r, e);
26258    }
26259
26260    #[simd_test(enable = "avx512fp16")]
26261    fn test_mm512_maskz_cvt_roundph_epi32() {
26262        let a = _mm256_set_ph(
26263            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26264        );
26265        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26266            0b0101010101010101,
26267            a,
26268        );
26269        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26270        assert_eq_m512i(r, e);
26271    }
26272
26273    #[simd_test(enable = "avx512fp16")]
26274    fn test_mm_cvtsh_i32() {
26275        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26276        let r = _mm_cvtsh_i32(a);
26277        assert_eq!(r, 1);
26278    }
26279
26280    #[simd_test(enable = "avx512fp16")]
26281    fn test_mm_cvt_roundsh_i32() {
26282        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26283        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26284        assert_eq!(r, 1);
26285    }
26286
26287    #[simd_test(enable = "avx512fp16,avx512vl")]
26288    fn test_mm_cvtph_epu32() {
26289        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26290        let r = _mm_cvtph_epu32(a);
26291        let e = _mm_set_epi32(1, 2, 3, 4);
26292        assert_eq_m128i(r, e);
26293    }
26294
26295    #[simd_test(enable = "avx512fp16,avx512vl")]
26296    fn test_mm_mask_cvtph_epu32() {
26297        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26298        let src = _mm_set_epi32(10, 11, 12, 13);
26299        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
26300        let e = _mm_set_epi32(10, 2, 12, 4);
26301        assert_eq_m128i(r, e);
26302    }
26303
26304    #[simd_test(enable = "avx512fp16,avx512vl")]
26305    fn test_mm_maskz_cvtph_epu32() {
26306        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26307        let r = _mm_maskz_cvtph_epu32(0b0101, a);
26308        let e = _mm_set_epi32(0, 2, 0, 4);
26309        assert_eq_m128i(r, e);
26310    }
26311
26312    #[simd_test(enable = "avx512fp16,avx512vl")]
26313    fn test_mm256_cvtph_epu32() {
26314        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26315        let r = _mm256_cvtph_epu32(a);
26316        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26317        assert_eq_m256i(r, e);
26318    }
26319
26320    #[simd_test(enable = "avx512fp16,avx512vl")]
26321    fn test_mm256_mask_cvtph_epu32() {
26322        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26323        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26324        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
26325        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26326        assert_eq_m256i(r, e);
26327    }
26328
26329    #[simd_test(enable = "avx512fp16,avx512vl")]
26330    fn test_mm256_maskz_cvtph_epu32() {
26331        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26332        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
26333        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26334        assert_eq_m256i(r, e);
26335    }
26336
26337    #[simd_test(enable = "avx512fp16")]
26338    fn test_mm512_cvtph_epu32() {
26339        let a = _mm256_set_ph(
26340            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26341        );
26342        let r = _mm512_cvtph_epu32(a);
26343        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26344        assert_eq_m512i(r, e);
26345    }
26346
26347    #[simd_test(enable = "avx512fp16")]
26348    fn test_mm512_mask_cvtph_epu32() {
26349        let a = _mm256_set_ph(
26350            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26351        );
26352        let src = _mm512_set_epi32(
26353            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26354        );
26355        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
26356        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26357        assert_eq_m512i(r, e);
26358    }
26359
26360    #[simd_test(enable = "avx512fp16")]
26361    fn test_mm512_maskz_cvtph_epu32() {
26362        let a = _mm256_set_ph(
26363            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26364        );
26365        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
26366        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26367        assert_eq_m512i(r, e);
26368    }
26369
26370    #[simd_test(enable = "avx512fp16")]
26371    fn test_mm512_cvt_roundph_epu32() {
26372        let a = _mm256_set_ph(
26373            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26374        );
26375        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26376        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26377        assert_eq_m512i(r, e);
26378    }
26379
26380    #[simd_test(enable = "avx512fp16")]
26381    fn test_mm512_mask_cvt_roundph_epu32() {
26382        let a = _mm256_set_ph(
26383            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26384        );
26385        let src = _mm512_set_epi32(
26386            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26387        );
26388        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26389            src,
26390            0b0101010101010101,
26391            a,
26392        );
26393        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26394        assert_eq_m512i(r, e);
26395    }
26396
26397    #[simd_test(enable = "avx512fp16")]
26398    fn test_mm512_maskz_cvt_roundph_epu32() {
26399        let a = _mm256_set_ph(
26400            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26401        );
26402        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26403            0b0101010101010101,
26404            a,
26405        );
26406        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26407        assert_eq_m512i(r, e);
26408    }
26409
26410    #[simd_test(enable = "avx512fp16")]
26411    fn test_mm_cvtsh_u32() {
26412        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26413        let r = _mm_cvtsh_u32(a);
26414        assert_eq!(r, 1);
26415    }
26416
26417    #[simd_test(enable = "avx512fp16")]
26418    fn test_mm_cvt_roundsh_u32() {
26419        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26420        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26421        assert_eq!(r, 1);
26422    }
26423
26424    #[simd_test(enable = "avx512fp16,avx512vl")]
26425    fn test_mm_cvttph_epi32() {
26426        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26427        let r = _mm_cvttph_epi32(a);
26428        let e = _mm_set_epi32(1, 2, 3, 4);
26429        assert_eq_m128i(r, e);
26430    }
26431
26432    #[simd_test(enable = "avx512fp16,avx512vl")]
26433    fn test_mm_mask_cvttph_epi32() {
26434        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26435        let src = _mm_set_epi32(10, 11, 12, 13);
26436        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
26437        let e = _mm_set_epi32(10, 2, 12, 4);
26438        assert_eq_m128i(r, e);
26439    }
26440
26441    #[simd_test(enable = "avx512fp16,avx512vl")]
26442    fn test_mm_maskz_cvttph_epi32() {
26443        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26444        let r = _mm_maskz_cvttph_epi32(0b0101, a);
26445        let e = _mm_set_epi32(0, 2, 0, 4);
26446        assert_eq_m128i(r, e);
26447    }
26448
26449    #[simd_test(enable = "avx512fp16,avx512vl")]
26450    fn test_mm256_cvttph_epi32() {
26451        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26452        let r = _mm256_cvttph_epi32(a);
26453        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26454        assert_eq_m256i(r, e);
26455    }
26456
26457    #[simd_test(enable = "avx512fp16,avx512vl")]
26458    fn test_mm256_mask_cvttph_epi32() {
26459        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26460        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26461        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
26462        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26463        assert_eq_m256i(r, e);
26464    }
26465
26466    #[simd_test(enable = "avx512fp16,avx512vl")]
26467    fn test_mm256_maskz_cvttph_epi32() {
26468        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26469        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
26470        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26471        assert_eq_m256i(r, e);
26472    }
26473
26474    #[simd_test(enable = "avx512fp16")]
26475    fn test_mm512_cvttph_epi32() {
26476        let a = _mm256_set_ph(
26477            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26478        );
26479        let r = _mm512_cvttph_epi32(a);
26480        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26481        assert_eq_m512i(r, e);
26482    }
26483
26484    #[simd_test(enable = "avx512fp16")]
26485    fn test_mm512_mask_cvttph_epi32() {
26486        let a = _mm256_set_ph(
26487            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26488        );
26489        let src = _mm512_set_epi32(
26490            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26491        );
26492        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
26493        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26494        assert_eq_m512i(r, e);
26495    }
26496
26497    #[simd_test(enable = "avx512fp16")]
26498    fn test_mm512_maskz_cvttph_epi32() {
26499        let a = _mm256_set_ph(
26500            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26501        );
26502        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
26503        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26504        assert_eq_m512i(r, e);
26505    }
26506
26507    #[simd_test(enable = "avx512fp16")]
26508    fn test_mm512_cvtt_roundph_epi32() {
26509        let a = _mm256_set_ph(
26510            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26511        );
26512        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
26513        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26514        assert_eq_m512i(r, e);
26515    }
26516
26517    #[simd_test(enable = "avx512fp16")]
26518    fn test_mm512_mask_cvtt_roundph_epi32() {
26519        let a = _mm256_set_ph(
26520            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26521        );
26522        let src = _mm512_set_epi32(
26523            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26524        );
26525        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26526        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26527        assert_eq_m512i(r, e);
26528    }
26529
26530    #[simd_test(enable = "avx512fp16")]
26531    fn test_mm512_maskz_cvtt_roundph_epi32() {
26532        let a = _mm256_set_ph(
26533            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26534        );
26535        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26536        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26537        assert_eq_m512i(r, e);
26538    }
26539
26540    #[simd_test(enable = "avx512fp16")]
26541    fn test_mm_cvttsh_i32() {
26542        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26543        let r = _mm_cvttsh_i32(a);
26544        assert_eq!(r, 1);
26545    }
26546
26547    #[simd_test(enable = "avx512fp16")]
26548    fn test_mm_cvtt_roundsh_i32() {
26549        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26550        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
26551        assert_eq!(r, 1);
26552    }
26553
26554    #[simd_test(enable = "avx512fp16,avx512vl")]
26555    fn test_mm_cvttph_epu32() {
26556        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26557        let r = _mm_cvttph_epu32(a);
26558        let e = _mm_set_epi32(1, 2, 3, 4);
26559        assert_eq_m128i(r, e);
26560    }
26561
26562    #[simd_test(enable = "avx512fp16,avx512vl")]
26563    fn test_mm_mask_cvttph_epu32() {
26564        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26565        let src = _mm_set_epi32(10, 11, 12, 13);
26566        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
26567        let e = _mm_set_epi32(10, 2, 12, 4);
26568        assert_eq_m128i(r, e);
26569    }
26570
    // Tests for truncating f16 -> unsigned 32-bit conversions
    // (vcvttph2udq and the scalar vcvttsh2usi forms).
    //
    // Note on ordering: `_mm*_set_ph` / `_mm*_set_epi32` list elements from the
    // highest lane down to lane 0 (see `_mm_set_ph` at the top of this file),
    // so the *rightmost* printed argument is lane 0. Mask bit i selects lane i;
    // with 0b0101... every other lane is converted, while unselected lanes are
    // taken from `src` (mask_ variants) or zeroed (maskz_ variants).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvttph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvttph_epu32(0b0101, a);
        // Lanes with a clear mask bit are zeroed.
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvttph_epu32() {
        // Only the 8 f16 lanes of the __m128h input feed the 8 u32 output lanes.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvttph_epu32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvttph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
        // Unselected lanes pass `src` through unchanged.
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvttph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvttph_epu32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

    // The _roundph_ variants take the rounding/exception mode as a const
    // generic; truncating conversions only accept _MM_FROUND_NO_EXC here
    // (truncation itself fixes the rounding direction).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

    // Scalar conversion: only lane 0 (the first `_mm_setr_ph` argument) is
    // converted; the remaining lanes are ignored.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm_cvttsh_u32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttsh_u32(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm_cvtt_roundsh_u32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
        assert_eq!(r, 1);
    }
26683
    // Tests for f16 -> signed 64-bit conversions (vcvtph2qq). Each i64 output
    // lane consumes one f16 input lane, so only the low 2/4/8 f16 lanes of the
    // __m128h input are used for the 128/256/512-bit results; the unused high
    // lanes are set to 0.0 in the fixtures below.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_epi64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtph_epi64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        // Mask 0b01: only lane 0 converted, lane 1 kept from `src`.
        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_epi64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_epi64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtph_epi64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_epi64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

    // Non-truncating round variants pair an explicit rounding mode with
    // exception suppression.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvt_roundph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26787
    // Tests for f16 -> unsigned 64-bit conversions (vcvtph2uqq). Fixtures and
    // mask conventions mirror the signed cvtph_epi64 tests directly above:
    // only the low f16 lanes feed the u64 output, mask bit i selects lane i.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_epu64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtph_epu64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_epu64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_epu64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtph_epu64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_epu64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvt_roundph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26891
    // Tests for truncating f16 -> signed 64-bit conversions (vcvttph2qq).
    // Same fixtures as the cvtph_epi64 family; the round variants take only
    // _MM_FROUND_NO_EXC since truncation fixes the rounding direction.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvttph_epi64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvttph_epi64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvttph_epi64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvttph_epi64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvttph_epi64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvttph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvttph_epi64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvttph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvttph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtt_roundph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26991
    // Tests for truncating f16 -> unsigned 64-bit conversions (vcvttph2uqq).
    // Structure is identical to the cvttph_epi64 family directly above.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvttph_epu64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvttph_epu64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvttph_epu64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvttph_epu64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvttph_epu64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvttph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvttph_epu64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvttph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvttph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtt_roundph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27091
    // Tests for f16 -> f32 widening conversions (vcvtph2psx). Each f32 output
    // lane consumes one f16 input lane, so only the low half of the f16 input
    // participates; unused high lanes are zero in the fixtures.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtxph_ps() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtxph_ps(a);
        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtxph_ps() {
        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        // Mask bit i selects converted lane i; cleared bits pass `src` through.
        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtxph_ps() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtxph_ps(0b0101, a);
        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtxph_ps() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtxph_ps(a);
        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtxph_ps() {
        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtxph_ps() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtxph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtxph_ps(a);
        let e = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtxph_ps() {
        let src = _mm512_set_ps(
            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
            24.0, 25.0,
        );
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
        let e = _mm512_set_ps(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtxph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
        let e = _mm512_set_ps(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m512(r, e);
    }

    // SAE ("round") variants: widening f16 -> f32 is exact, so only exception
    // suppression (_MM_FROUND_NO_EXC) is passed.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtx_roundph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtx_roundph_ps() {
        let src = _mm512_set_ps(
            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
            24.0, 25.0,
        );
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_ps(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtx_roundph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_ps(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m512(r, e);
    }
27223
    // Tests for scalar f16 -> f32 conversion (vcvtsh2ss): lane 0 of the result
    // is b's lane 0 converted to f32, lanes 1..3 are copied from `a`. The
    // masked forms check both mask states (0 -> fallback, 1 -> converted).
    #[simd_test(enable = "avx512fp16")]
    fn test_mm_cvtsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvtsh_ss(a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm_mask_cvtsh_ss() {
        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        // Mask clear: lane 0 falls back to src's lane 0.
        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        // Mask set: lane 0 is the converted value.
        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm_maskz_cvtsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        // Mask clear: lane 0 is zeroed.
        let r = _mm_maskz_cvtsh_ss(0, a, b);
        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_maskz_cvtsh_ss(1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm_cvt_roundsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm_mask_cvt_roundsh_ss() {
        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm_maskz_cvt_roundsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }
27291
    // Tests for f16 -> f64 widening conversions (vcvtph2pd). Each f64 output
    // lane consumes one f16 input lane, so only the low 2/4/8 f16 lanes feed
    // the 128/256/512-bit results; unused high lanes are zero in the fixtures.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_pd(a);
        let e = _mm_set_pd(1.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtph_pd() {
        let src = _mm_set_pd(10.0, 11.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        // Mask 0b01: lane 0 converted, lane 1 kept from `src`.
        let r = _mm_mask_cvtph_pd(src, 0b01, a);
        let e = _mm_set_pd(10.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_pd(0b01, a);
        let e = _mm_set_pd(0.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_pd(a);
        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtph_pd() {
        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_pd(0b0101, a);
        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_pd(a);
        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtph_pd() {
        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m512d(r, e);
    }

    // SAE ("round") variants: widening f16 -> f64 is exact, so only exception
    // suppression (_MM_FROUND_NO_EXC) is passed.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvt_roundph_pd() {
        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m512d(r, e);
    }
27391
27392    #[simd_test(enable = "avx512fp16")]
27393    fn test_mm_cvtsh_sd() {
27394        let a = _mm_setr_pd(2.0, 20.0);
27395        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27396        let r = _mm_cvtsh_sd(a, b);
27397        let e = _mm_setr_pd(1.0, 20.0);
27398        assert_eq_m128d(r, e);
27399    }
27400
27401    #[simd_test(enable = "avx512fp16")]
27402    fn test_mm_mask_cvtsh_sd() {
27403        let src = _mm_setr_pd(3.0, 11.0);
27404        let a = _mm_setr_pd(2.0, 20.0);
27405        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27406        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
27407        let e = _mm_setr_pd(3.0, 20.0);
27408        assert_eq_m128d(r, e);
27409        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
27410        let e = _mm_setr_pd(1.0, 20.0);
27411        assert_eq_m128d(r, e);
27412    }
27413
27414    #[simd_test(enable = "avx512fp16")]
27415    fn test_mm_maskz_cvtsh_sd() {
27416        let a = _mm_setr_pd(2.0, 20.0);
27417        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27418        let r = _mm_maskz_cvtsh_sd(0, a, b);
27419        let e = _mm_setr_pd(0.0, 20.0);
27420        assert_eq_m128d(r, e);
27421        let r = _mm_maskz_cvtsh_sd(1, a, b);
27422        let e = _mm_setr_pd(1.0, 20.0);
27423        assert_eq_m128d(r, e);
27424    }
27425
27426    #[simd_test(enable = "avx512fp16")]
27427    fn test_mm_cvt_roundsh_sd() {
27428        let a = _mm_setr_pd(2.0, 20.0);
27429        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27430        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
27431        let e = _mm_setr_pd(1.0, 20.0);
27432        assert_eq_m128d(r, e);
27433    }
27434
27435    #[simd_test(enable = "avx512fp16")]
27436    fn test_mm_mask_cvt_roundsh_sd() {
27437        let src = _mm_setr_pd(3.0, 11.0);
27438        let a = _mm_setr_pd(2.0, 20.0);
27439        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27440        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27441        let e = _mm_setr_pd(3.0, 20.0);
27442        assert_eq_m128d(r, e);
27443        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27444        let e = _mm_setr_pd(1.0, 20.0);
27445        assert_eq_m128d(r, e);
27446    }
27447
27448    #[simd_test(enable = "avx512fp16")]
27449    fn test_mm_maskz_cvt_roundsh_sd() {
27450        let a = _mm_setr_pd(2.0, 20.0);
27451        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27452        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
27453        let e = _mm_setr_pd(0.0, 20.0);
27454        assert_eq_m128d(r, e);
27455        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
27456        let e = _mm_setr_pd(1.0, 20.0);
27457        assert_eq_m128d(r, e);
27458    }
27459
27460    #[simd_test(enable = "avx512fp16")]
27461    const fn test_mm_cvtsh_h() {
27462        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
27463        let r = _mm_cvtsh_h(a);
27464        assert_eq!(r, 1.0);
27465    }
27466
27467    #[simd_test(enable = "avx512fp16")]
27468    const fn test_mm256_cvtsh_h() {
27469        let a = _mm256_setr_ph(
27470            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27471        );
27472        let r = _mm256_cvtsh_h(a);
27473        assert_eq!(r, 1.0);
27474    }
27475
27476    #[simd_test(enable = "avx512fp16")]
27477    const fn test_mm512_cvtsh_h() {
27478        let a = _mm512_setr_ph(
27479            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27480            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
27481            31.0, 32.0,
27482        );
27483        let r = _mm512_cvtsh_h(a);
27484        assert_eq!(r, 1.0);
27485    }
27486
27487    #[simd_test(enable = "avx512fp16")]
27488    const fn test_mm_cvtsi128_si16() {
27489        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
27490        let r = _mm_cvtsi128_si16(a);
27491        assert_eq!(r, 1);
27492    }
27493
27494    #[simd_test(enable = "avx512fp16")]
27495    const fn test_mm_cvtsi16_si128() {
27496        let a = 1;
27497        let r = _mm_cvtsi16_si128(a);
27498        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
27499        assert_eq_m128i(r, e);
27500    }
27501}