core/stdarch/crates/core_arch/src/x86/avx512fp16.rs

use crate::arch::asm;
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::{fmaf16, simd::*};
use crate::ptr;

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_ph(
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}
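
// Editorial usage sketch (not part of the upstream source; the helper name is
// hypothetical): `_mm_set_ph` takes its arguments from the highest lane (`e7`)
// down to lane 0 (`e0`), so the last argument ends up in the lowest element.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _example_set_ph_lane_order() -> __m128h {
    // Lane 0 of the result holds 0.0 and lane 7 holds 7.0.
    _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)
}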

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set_ph(
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set_ph(
    e31: f16,
    e30: f16,
    e29: f16,
    e28: f16,
    e27: f16,
    e26: f16,
    e25: f16,
    e24: f16,
    e23: f16,
    e22: f16,
    e21: f16,
    e20: f16,
    e19: f16,
    e18: f16,
    e17: f16,
    e16: f16,
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Copy the half-precision (16-bit) floating-point value a to the lower element of dst and zero
/// the upper 7 elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_sh(a: f16) -> __m128h {
    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set1_ph(a: f16) -> __m128h {
    unsafe { transmute(f16x8::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set1_ph(a: f16) -> __m256h {
    unsafe { transmute(f16x16::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set1_ph(a: f16) -> __m512h {
    unsafe { transmute(f16x32::splat(a)) }
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
    e16: f16,
    e17: f16,
    e18: f16,
    e19: f16,
    e20: f16,
    e21: f16,
    e22: f16,
    e23: f16,
    e24: f16,
    e25: f16,
    e26: f16,
    e27: f16,
    e28: f16,
    e29: f16,
    e30: f16,
    e31: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}
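
// Editorial usage sketch (hypothetical helper, not in the upstream source): the
// `setr` variants take their arguments in memory order (lane 0 first), so this
// produces the same vector as the `_mm_set_ph` sketch earlier in the file.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _example_setr_ph_lane_order() -> __m128h {
    // Arguments are listed from lane 0 upwards: lane 0 holds 0.0, lane 7 holds 7.0.
    _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0)
}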

/// Return vector of type __m128h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setzero_ph() -> __m128h {
    unsafe { transmute(f16x8::ZERO) }
}

/// Return vector of type __m256h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setzero_ph() -> __m256h {
    f16x16::ZERO.as_m256h()
}

/// Return vector of type __m512h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setzero_ph() -> __m512h {
    f16x32::ZERO.as_m512h()
}

/// Return vector of type `__m128h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_undefined_ph() -> __m128h {
    f16x8::ZERO.as_m128h()
}

/// Return vector of type `__m256h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_undefined_ph() -> __m256h {
    f16x16::ZERO.as_m256h()
}

/// Return vector of type `__m512h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_undefined_ph() -> __m512h {
    f16x32::ZERO.as_m512h()
}

/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castpd_ph(a: __m128d) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castpd_ph(a: __m256d) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castpd_ph(a: __m512d) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_pd(a: __m128h) -> __m128d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_pd(a: __m256h) -> __m256d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_pd(a: __m512h) -> __m512d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castps_ph(a: __m128) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castps_ph(a: __m256) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castps_ph(a: __m512) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_ps(a: __m128h) -> __m128 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_ps(a: __m256h) -> __m256 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_ps(a: __m512h) -> __m512 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castsi128_ph(a: __m128i) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_si128(a: __m128h) -> __m128i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_si256(a: __m256h) -> __m256i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_si512(a: __m512h) -> __m512i {
    unsafe { transmute(a) }
}
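
// Editorial usage sketch (hypothetical helper, not upstream): the cast intrinsics
// only reinterpret bits between vector types of the same width, so a round trip
// through `__m128i` returns the original half-precision lanes unchanged and emits
// no instructions.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _example_cast_roundtrip(a: __m128h) -> __m128h {
    _mm_castsi128_ph(_mm_castph_si128(a))
}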

/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}
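
// Editorial usage sketch (hypothetical helper, not upstream): when widening a
// vector, `_mm256_castph128_ph256` leaves the upper 8 lanes undefined, whereas
// `_mm256_zextph128_ph256` guarantees they are zeroed. The `zext` form is the
// safer choice whenever the upper lanes are observed later.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _example_widen_with_zeroed_upper(a: __m128h) -> __m256h {
    _mm256_zextph128_ph256(a)
}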

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}

macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, xmm_reg, a, b)
    }
}
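
// Editorial usage sketch (hypothetical helper, not upstream): the IMM5 parameter
// is one of the `_CMP_*` predicate constants. Here each lane of `a` that compares
// less-than the corresponding lane of `b` sets its bit in the returned mask.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn _example_cmp_lt_mask(a: __m128h, b: __m128h) -> __mmask8 {
    _mm_cmp_ph_mask::<_CMP_LT_OS>(a, b)
}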

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, zmm_reg, a, b)
        }
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                k1 = in(kreg) k1,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
        }
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}
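
// Editorial usage sketch (hypothetical helper, not upstream): passing
// `_MM_FROUND_NO_EXC` as the SAE parameter suppresses floating-point exception
// signalling for the scalar compare; only bit 0 of the returned mask is meaningful.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _example_cmp_sh_no_exc(a: __m128h, b: __m128h) -> __mmask8 {
    _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b)
}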

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcmpsh(a, b, IMM5, k1, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcomish(a, b, IMM5, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OQ>(a, b)
}
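
// Editorial usage sketch (hypothetical helper, not upstream): the `_mm_comi*_sh`
// family uses the signalling (`_OS`) predicates, while the `_mm_ucomi*_sh` family
// uses the quiet (`_OQ`) predicates that do not signal an exception for QNaN
// inputs; both return 0 or 1 rather than a mask.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _example_quiet_scalar_equality(a: __m128h, b: __m128h) -> bool {
    _mm_ucomieq_sh(a, b) == 1
}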

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
    *mem_addr.cast()
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
    *mem_addr.cast()
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
    *mem_addr.cast()
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
/// and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
    _mm_set_sh(*mem_addr)
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst = src;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}"),
        dst = inout(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst: __m128h;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
        dst = out(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
    ptr::read_unaligned(mem_addr.cast())
}
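
// Editorial usage sketch (hypothetical helper, not upstream): `_mm_loadu_ph` has
// no alignment requirement, while `_mm_load_ph` requires the pointer to be
// 16-byte aligned; both read 8 consecutive `f16` values.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn _example_loadu_ph(values: &[f16; 8]) -> __m128h {
    _mm_loadu_ph(values.as_ptr())
}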

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mut mov: f16 = simd_extract!(src, 0);
        if (k & 1) != 0 {
            mov = simd_extract!(b, 0);
        }
        simd_insert!(a, 0, mov)
    }
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mut mov: f16 = 0.;
        if (k & 1) != 0 {
            mov = simd_extract!(b, 0);
        }
        simd_insert!(a, 0, mov)
    }
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mov: f16 = simd_extract!(b, 0);
        simd_insert!(a, 0, mov)
    }
}
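
// Editorial usage sketch (hypothetical helper, not upstream): with mask bit 0 set,
// the lower lane comes from `b`; with it clear, `_mm_mask_move_sh` keeps the lower
// lane of `src` instead. The upper 7 lanes always come from `a`.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _example_mask_move_sh(src: __m128h, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_move_sh(src, 0b1, a, b)
}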
1243
1244/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1245/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
1246///
1247/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
1248#[inline]
1249#[target_feature(enable = "avx512fp16,avx512vl")]
1250#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1251pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
1252    *mem_addr.cast() = a;
1253}
1254
1255/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1256/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
1257///
1258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
1259#[inline]
1260#[target_feature(enable = "avx512fp16,avx512vl")]
1261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1262pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
1263    *mem_addr.cast() = a;
1264}
1265
1266/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1267/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
1268///
1269/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
1270#[inline]
1271#[target_feature(enable = "avx512fp16")]
1272#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1273pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
1274    *mem_addr.cast() = a;
1275}
1276
1277/// Store the lower half-precision (16-bit) floating-point element from a into memory.
1278///
1279/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
1280#[inline]
1281#[target_feature(enable = "avx512fp16")]
1282#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1283pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
1284    *mem_addr = simd_extract!(a, 0);
1285}
1286
1287/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
1288///
1289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
1290#[inline]
1291#[target_feature(enable = "avx512fp16")]
1292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1293pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
1294    asm!(
1295        vps!("vmovdqu16", "{{{k}}}, {src}"),
1296        p = in(reg) mem_addr,
1297        k = in(kreg) k,
1298        src = in(xmm_reg) a,
1299        options(nostack, preserves_flags)
1300    );
1301}
1302
1303/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1304/// The address does not need to be aligned to any particular boundary.
1305///
1306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
1307#[inline]
1308#[target_feature(enable = "avx512fp16,avx512vl")]
1309#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1310pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
1311    ptr::write_unaligned(mem_addr.cast(), a);
1312}
1313
1314/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1315/// The address does not need to be aligned to any particular boundary.
1316///
1317/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
1318#[inline]
1319#[target_feature(enable = "avx512fp16,avx512vl")]
1320#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1321pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
1322    ptr::write_unaligned(mem_addr.cast(), a);
1323}
1324
1325/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1326/// The address does not need to be aligned to any particular boundary.
1327///
1328/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
1329#[inline]
1330#[target_feature(enable = "avx512fp16")]
1331#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1332pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
1333    ptr::write_unaligned(mem_addr.cast(), a);
1334}
1335
1336/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1337///
1338/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
1339#[inline]
1340#[target_feature(enable = "avx512fp16,avx512vl")]
1341#[cfg_attr(test, assert_instr(vaddph))]
1342#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1343pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
1344    unsafe { simd_add(a, b) }
1345}
1346
1347/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1348/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1349///
1350/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
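///
/// # Example
///
/// An illustrative sketch (not from the original source) of the writemask behaviour:
/// lanes whose mask bit is clear keep the value from `src`. It assumes a CPU with
/// `avx512fp16` and `avx512vl`; the helper name `blend_add` is hypothetical.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn blend_add(src: __m128h, a: __m128h, b: __m128h) -> __m128h {
///     // Bits 0..=3 set: lanes 0..=3 receive a + b, lanes 4..=7 are copied from src.
///     _mm_mask_add_ph(src, 0b0000_1111, a, b)
/// }
/// ```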
1351#[inline]
1352#[target_feature(enable = "avx512fp16,avx512vl")]
1353#[cfg_attr(test, assert_instr(vaddph))]
1354#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1355pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1356    unsafe {
1357        let r = _mm_add_ph(a, b);
1358        simd_select_bitmask(k, r, src)
1359    }
1360}
1361
1362/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1363/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1364///
1365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
1366#[inline]
1367#[target_feature(enable = "avx512fp16,avx512vl")]
1368#[cfg_attr(test, assert_instr(vaddph))]
1369#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1370pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1371    unsafe {
1372        let r = _mm_add_ph(a, b);
1373        simd_select_bitmask(k, r, _mm_setzero_ph())
1374    }
1375}
1376
1377/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1378///
1379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
1380#[inline]
1381#[target_feature(enable = "avx512fp16,avx512vl")]
1382#[cfg_attr(test, assert_instr(vaddph))]
1383#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1384pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
1385    unsafe { simd_add(a, b) }
1386}
1387
1388/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1389/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1390///
1391/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
1392#[inline]
1393#[target_feature(enable = "avx512fp16,avx512vl")]
1394#[cfg_attr(test, assert_instr(vaddph))]
1395#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1396pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1397    unsafe {
1398        let r = _mm256_add_ph(a, b);
1399        simd_select_bitmask(k, r, src)
1400    }
1401}
1402
1403/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1404/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1405///
1406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
1407#[inline]
1408#[target_feature(enable = "avx512fp16,avx512vl")]
1409#[cfg_attr(test, assert_instr(vaddph))]
1410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1411pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1412    unsafe {
1413        let r = _mm256_add_ph(a, b);
1414        simd_select_bitmask(k, r, _mm256_setzero_ph())
1415    }
1416}
1417
1418/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1419///
1420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
1421#[inline]
1422#[target_feature(enable = "avx512fp16")]
1423#[cfg_attr(test, assert_instr(vaddph))]
1424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1425pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
1426    unsafe { simd_add(a, b) }
1427}
1428
1429/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1430/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1431///
1432/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
1433#[inline]
1434#[target_feature(enable = "avx512fp16")]
1435#[cfg_attr(test, assert_instr(vaddph))]
1436#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1437pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1438    unsafe {
1439        let r = _mm512_add_ph(a, b);
1440        simd_select_bitmask(k, r, src)
1441    }
1442}
1443
1444/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1445/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1446///
1447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
1448#[inline]
1449#[target_feature(enable = "avx512fp16")]
1450#[cfg_attr(test, assert_instr(vaddph))]
1451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1452pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1453    unsafe {
1454        let r = _mm512_add_ph(a, b);
1455        simd_select_bitmask(k, r, _mm512_setzero_ph())
1456    }
1457}
1458
1459/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1460/// Rounding is done according to the rounding parameter, which can be one of:
1461///
1462/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1463/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1464/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1465/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1466/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1467///
1468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
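///
/// # Example
///
/// An illustrative sketch (not from the original source): selecting an explicit
/// rounding mode through the const generic parameter. It assumes a CPU with
/// `avx512fp16`; the helper name `add_round_up` is hypothetical.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn add_round_up(a: __m512h, b: __m512h) -> __m512h {
///     // Round every sum toward +infinity and suppress exceptions.
///     _mm512_add_round_ph::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b)
/// }
/// ```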
1469#[inline]
1470#[target_feature(enable = "avx512fp16")]
1471#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1472#[rustc_legacy_const_generics(2)]
1473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1474pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1475    unsafe {
1476        static_assert_rounding!(ROUNDING);
1477        vaddph(a, b, ROUNDING)
1478    }
1479}
1480
1481/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1482/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1483/// Rounding is done according to the rounding parameter, which can be one of:
1484///
1485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1490///
1491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
1492#[inline]
1493#[target_feature(enable = "avx512fp16")]
1494#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1495#[rustc_legacy_const_generics(4)]
1496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1497pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
1498    src: __m512h,
1499    k: __mmask32,
1500    a: __m512h,
1501    b: __m512h,
1502) -> __m512h {
1503    unsafe {
1504        static_assert_rounding!(ROUNDING);
1505        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1506        simd_select_bitmask(k, r, src)
1507    }
1508}
1509
1510/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1511/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1512/// Rounding is done according to the rounding parameter, which can be one of:
1513///
1514/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1515/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1516/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1517/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1518///
1519/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
1520#[inline]
1521#[target_feature(enable = "avx512fp16")]
1522#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1523#[rustc_legacy_const_generics(3)]
1524#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1525pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
1526    k: __mmask32,
1527    a: __m512h,
1528    b: __m512h,
1529) -> __m512h {
1530    unsafe {
1531        static_assert_rounding!(ROUNDING);
1532        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1533        simd_select_bitmask(k, r, _mm512_setzero_ph())
1534    }
1535}
1536
1537/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1538/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1539/// Rounding is done according to the rounding parameter, which can be one of:
1540///
1541/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1542/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1543/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1544/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1545/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1546///
1547/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
1548#[inline]
1549#[target_feature(enable = "avx512fp16")]
1550#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1551#[rustc_legacy_const_generics(2)]
1552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1553pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1554    static_assert_rounding!(ROUNDING);
1555    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
1556}
1557
1558/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1559/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1560/// writemask k (the element is copied from src when mask bit 0 is not set).
1561/// Rounding is done according to the rounding parameter, which can be one of:
1562///
1563/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1564/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1565/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1566/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1567/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1568///
1569/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
1570#[inline]
1571#[target_feature(enable = "avx512fp16")]
1572#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1573#[rustc_legacy_const_generics(4)]
1574#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1575pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
1576    src: __m128h,
1577    k: __mmask8,
1578    a: __m128h,
1579    b: __m128h,
1580) -> __m128h {
1581    unsafe {
1582        static_assert_rounding!(ROUNDING);
1583        vaddsh(a, b, src, k, ROUNDING)
1584    }
1585}
1586
1587/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1588/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1589/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1590/// Rounding is done according to the rounding parameter, which can be one of:
1591///
1592/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1593/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1594/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1595/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1596/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1597///
1598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
1599#[inline]
1600#[target_feature(enable = "avx512fp16")]
1601#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1602#[rustc_legacy_const_generics(3)]
1603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1604pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1605    static_assert_rounding!(ROUNDING);
1606    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
1607}
1608
1609/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1610/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1611///
1612/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
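///
/// # Example
///
/// An illustrative sketch (not from the original source) of the scalar behaviour:
/// only element 0 is added, the remaining lanes come from `a`. It assumes a CPU
/// with `avx512fp16`; the helper name `add_lowest` is hypothetical.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn add_lowest(a: __m128h, b: __m128h) -> __m128h {
///     // dst[0] = a[0] + b[0]; dst[1..=7] = a[1..=7]
///     _mm_add_sh(a, b)
/// }
/// ```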
1613#[inline]
1614#[target_feature(enable = "avx512fp16")]
1615#[cfg_attr(test, assert_instr(vaddsh))]
1616#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1617pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1618    _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1619}
1620
1621/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1622/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1623/// writemask k (the element is copied from src when mask bit 0 is not set).
1624///
1625/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
1626#[inline]
1627#[target_feature(enable = "avx512fp16")]
1628#[cfg_attr(test, assert_instr(vaddsh))]
1629#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1630pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1631    _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1632}
1633
1634/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1635/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1636/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1637///
1638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
1639#[inline]
1640#[target_feature(enable = "avx512fp16")]
1641#[cfg_attr(test, assert_instr(vaddsh))]
1642#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1643pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1644    _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1645}
1646
1647/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1648///
1649/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
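///
/// # Example
///
/// An illustrative sketch (not from the original source) highlighting the operand
/// order: `b` is subtracted from `a`, lane by lane. It assumes a CPU with
/// `avx512fp16` and `avx512vl`; the helper name `sub_demo` is hypothetical.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn sub_demo() -> __m128h {
///     let a = _mm_set1_ph(3.0);
///     let b = _mm_set1_ph(1.0);
///     // Every lane holds 3.0 - 1.0 = 2.0.
///     _mm_sub_ph(a, b)
/// }
/// ```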
1650#[inline]
1651#[target_feature(enable = "avx512fp16,avx512vl")]
1652#[cfg_attr(test, assert_instr(vsubph))]
1653#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1654pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
1655    unsafe { simd_sub(a, b) }
1656}
1657
1658/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1659/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1660///
1661/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
1662#[inline]
1663#[target_feature(enable = "avx512fp16,avx512vl")]
1664#[cfg_attr(test, assert_instr(vsubph))]
1665#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1666pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1667    unsafe {
1668        let r = _mm_sub_ph(a, b);
1669        simd_select_bitmask(k, r, src)
1670    }
1671}
1672
1673/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1674/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1675///
1676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
1677#[inline]
1678#[target_feature(enable = "avx512fp16,avx512vl")]
1679#[cfg_attr(test, assert_instr(vsubph))]
1680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1681pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1682    unsafe {
1683        let r = _mm_sub_ph(a, b);
1684        simd_select_bitmask(k, r, _mm_setzero_ph())
1685    }
1686}
1687
1688/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1689///
1690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
1691#[inline]
1692#[target_feature(enable = "avx512fp16,avx512vl")]
1693#[cfg_attr(test, assert_instr(vsubph))]
1694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1695pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
1696    unsafe { simd_sub(a, b) }
1697}
1698
1699/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1700/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1701///
1702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
1703#[inline]
1704#[target_feature(enable = "avx512fp16,avx512vl")]
1705#[cfg_attr(test, assert_instr(vsubph))]
1706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1707pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1708    unsafe {
1709        let r = _mm256_sub_ph(a, b);
1710        simd_select_bitmask(k, r, src)
1711    }
1712}
1713
1714/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1715/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1716///
1717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
1718#[inline]
1719#[target_feature(enable = "avx512fp16,avx512vl")]
1720#[cfg_attr(test, assert_instr(vsubph))]
1721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1722pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1723    unsafe {
1724        let r = _mm256_sub_ph(a, b);
1725        simd_select_bitmask(k, r, _mm256_setzero_ph())
1726    }
1727}
1728
1729/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1730///
1731/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
1732#[inline]
1733#[target_feature(enable = "avx512fp16")]
1734#[cfg_attr(test, assert_instr(vsubph))]
1735#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1736pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
1737    unsafe { simd_sub(a, b) }
1738}
1739
1740/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1741/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1742///
1743/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
1744#[inline]
1745#[target_feature(enable = "avx512fp16")]
1746#[cfg_attr(test, assert_instr(vsubph))]
1747#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1748pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1749    unsafe {
1750        let r = _mm512_sub_ph(a, b);
1751        simd_select_bitmask(k, r, src)
1752    }
1753}
1754
1755/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1756/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1757///
1758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
1759#[inline]
1760#[target_feature(enable = "avx512fp16")]
1761#[cfg_attr(test, assert_instr(vsubph))]
1762#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1763pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1764    unsafe {
1765        let r = _mm512_sub_ph(a, b);
1766        simd_select_bitmask(k, r, _mm512_setzero_ph())
1767    }
1768}
1769
1770/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1771/// Rounding is done according to the rounding parameter, which can be one of:
1772///
1773/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1774/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1775/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1776/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1777/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1778///
1779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
1780#[inline]
1781#[target_feature(enable = "avx512fp16")]
1782#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1783#[rustc_legacy_const_generics(2)]
1784#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1785pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1786    unsafe {
1787        static_assert_rounding!(ROUNDING);
1788        vsubph(a, b, ROUNDING)
1789    }
1790}
1791
1792/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1793/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1794/// Rounding is done according to the rounding parameter, which can be one of:
1795///
1796/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1797/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1798/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1799/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1800/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1801///
1802/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
1803#[inline]
1804#[target_feature(enable = "avx512fp16")]
1805#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1806#[rustc_legacy_const_generics(4)]
1807#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1808pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
1809    src: __m512h,
1810    k: __mmask32,
1811    a: __m512h,
1812    b: __m512h,
1813) -> __m512h {
1814    unsafe {
1815        static_assert_rounding!(ROUNDING);
1816        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1817        simd_select_bitmask(k, r, src)
1818    }
1819}
1820
1821/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1822/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1823/// Rounding is done according to the rounding parameter, which can be one of:
1824///
1825/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1826/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1827/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1828/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1829/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1830///
1831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
1832#[inline]
1833#[target_feature(enable = "avx512fp16")]
1834#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1835#[rustc_legacy_const_generics(3)]
1836#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1837pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
1838    k: __mmask32,
1839    a: __m512h,
1840    b: __m512h,
1841) -> __m512h {
1842    unsafe {
1843        static_assert_rounding!(ROUNDING);
1844        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1845        simd_select_bitmask(k, r, _mm512_setzero_ph())
1846    }
1847}
1848
1849/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1850/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1851/// Rounding is done according to the rounding parameter, which can be one of:
1852///
1853/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1854/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1855/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1856/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1857/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1858///
1859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
1860#[inline]
1861#[target_feature(enable = "avx512fp16")]
1862#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1863#[rustc_legacy_const_generics(2)]
1864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1865pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1866    static_assert_rounding!(ROUNDING);
1867    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
1868}
1869
1870/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1871/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1872/// writemask k (the element is copied from src when mask bit 0 is not set).
1873/// Rounding is done according to the rounding parameter, which can be one of:
1874///
1875/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1876/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1877/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1878/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1879/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1880///
1881/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
1882#[inline]
1883#[target_feature(enable = "avx512fp16")]
1884#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1885#[rustc_legacy_const_generics(4)]
1886#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1887pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
1888    src: __m128h,
1889    k: __mmask8,
1890    a: __m128h,
1891    b: __m128h,
1892) -> __m128h {
1893    unsafe {
1894        static_assert_rounding!(ROUNDING);
1895        vsubsh(a, b, src, k, ROUNDING)
1896    }
1897}
1898
1899/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1900/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1901/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1902/// Rounding is done according to the rounding parameter, which can be one of:
1903///
1904/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1905/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1906/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1907/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1908/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1909///
1910/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
1911#[inline]
1912#[target_feature(enable = "avx512fp16")]
1913#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1914#[rustc_legacy_const_generics(3)]
1915#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1916pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1917    static_assert_rounding!(ROUNDING);
1918    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
1919}
1920
1921/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1922/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1923///
1924/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
1925#[inline]
1926#[target_feature(enable = "avx512fp16")]
1927#[cfg_attr(test, assert_instr(vsubsh))]
1928#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1929pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
1930    _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1931}
1932
1933/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1934/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1935/// writemask k (the element is copied from src when mask bit 0 is not set).
1936///
1937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
1938#[inline]
1939#[target_feature(enable = "avx512fp16")]
1940#[cfg_attr(test, assert_instr(vsubsh))]
1941#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1942pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1943    _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1944}
1945
1946/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1947/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1948/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1949///
1950/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
1951#[inline]
1952#[target_feature(enable = "avx512fp16")]
1953#[cfg_attr(test, assert_instr(vsubsh))]
1954#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1955pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1956    _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1957}
1958
1959/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1960///
1961/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
1962#[inline]
1963#[target_feature(enable = "avx512fp16,avx512vl")]
1964#[cfg_attr(test, assert_instr(vmulph))]
1965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1966pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
1967    unsafe { simd_mul(a, b) }
1968}
1969
1970/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1971/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1972///
1973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
1974#[inline]
1975#[target_feature(enable = "avx512fp16,avx512vl")]
1976#[cfg_attr(test, assert_instr(vmulph))]
1977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1978pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1979    unsafe {
1980        let r = _mm_mul_ph(a, b);
1981        simd_select_bitmask(k, r, src)
1982    }
1983}
1984
1985/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1986/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1987///
1988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
1989#[inline]
1990#[target_feature(enable = "avx512fp16,avx512vl")]
1991#[cfg_attr(test, assert_instr(vmulph))]
1992#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1993pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1994    unsafe {
1995        let r = _mm_mul_ph(a, b);
1996        simd_select_bitmask(k, r, _mm_setzero_ph())
1997    }
1998}
1999
2000/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2001///
2002/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
2003#[inline]
2004#[target_feature(enable = "avx512fp16,avx512vl")]
2005#[cfg_attr(test, assert_instr(vmulph))]
2006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2007pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
2008    unsafe { simd_mul(a, b) }
2009}
2010
2011/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2012/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2013///
2014/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
2015#[inline]
2016#[target_feature(enable = "avx512fp16,avx512vl")]
2017#[cfg_attr(test, assert_instr(vmulph))]
2018#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2019pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2020    unsafe {
2021        let r = _mm256_mul_ph(a, b);
2022        simd_select_bitmask(k, r, src)
2023    }
2024}
2025
2026/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2027/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2028///
2029/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
2030#[inline]
2031#[target_feature(enable = "avx512fp16,avx512vl")]
2032#[cfg_attr(test, assert_instr(vmulph))]
2033#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2034pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2035    unsafe {
2036        let r = _mm256_mul_ph(a, b);
2037        simd_select_bitmask(k, r, _mm256_setzero_ph())
2038    }
2039}
2040
2041/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2042///
2043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
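///
/// # Example
///
/// An illustrative sketch (not from the original source): scaling all 32
/// half-precision lanes of a 512-bit vector at once. It assumes a CPU with
/// `avx512fp16`; the helper name `scale_by_two` is hypothetical.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn scale_by_two(v: __m512h) -> __m512h {
///     // Element-wise multiply: each of the 32 lanes is doubled.
///     _mm512_mul_ph(v, _mm512_set1_ph(2.0))
/// }
/// ```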
2044#[inline]
2045#[target_feature(enable = "avx512fp16")]
2046#[cfg_attr(test, assert_instr(vmulph))]
2047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2048pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
2049    unsafe { simd_mul(a, b) }
2050}
2051
2052/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2053/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2054///
2055/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
2056#[inline]
2057#[target_feature(enable = "avx512fp16")]
2058#[cfg_attr(test, assert_instr(vmulph))]
2059#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2060pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2061    unsafe {
2062        let r = _mm512_mul_ph(a, b);
2063        simd_select_bitmask(k, r, src)
2064    }
2065}
2066
2067/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2068/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2069///
2070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
2071#[inline]
2072#[target_feature(enable = "avx512fp16")]
2073#[cfg_attr(test, assert_instr(vmulph))]
2074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2075pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2076    unsafe {
2077        let r = _mm512_mul_ph(a, b);
2078        simd_select_bitmask(k, r, _mm512_setzero_ph())
2079    }
2080}
2081
2082/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2083/// Rounding is done according to the rounding parameter, which can be one of:
2084///
2085/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2086/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2087/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2088/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2089/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2090///
2091/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
2092#[inline]
2093#[target_feature(enable = "avx512fp16")]
2094#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2095#[rustc_legacy_const_generics(2)]
2096#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2097pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2098    unsafe {
2099        static_assert_rounding!(ROUNDING);
2100        vmulph(a, b, ROUNDING)
2101    }
2102}
2103
2104/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2105/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2106/// Rounding is done according to the rounding parameter, which can be one of:
2107///
2108/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2109/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2110/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2111/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2112/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2113///
2114/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
2115#[inline]
2116#[target_feature(enable = "avx512fp16")]
2117#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2118#[rustc_legacy_const_generics(4)]
2119#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2120pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
2121    src: __m512h,
2122    k: __mmask32,
2123    a: __m512h,
2124    b: __m512h,
2125) -> __m512h {
2126    unsafe {
2127        static_assert_rounding!(ROUNDING);
2128        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2129        simd_select_bitmask(k, r, src)
2130    }
2131}
2132
2133/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2134/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2135/// Rounding is done according to the rounding parameter, which can be one of:
2136///
2137/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2138/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2139/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2140/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2141/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2142///
2143/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
2144#[inline]
2145#[target_feature(enable = "avx512fp16")]
2146#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2147#[rustc_legacy_const_generics(3)]
2148#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2149pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
2150    k: __mmask32,
2151    a: __m512h,
2152    b: __m512h,
2153) -> __m512h {
2154    unsafe {
2155        static_assert_rounding!(ROUNDING);
2156        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2157        simd_select_bitmask(k, r, _mm512_setzero_ph())
2158    }
2159}
2160
2161/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2162/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2163/// Rounding is done according to the rounding parameter, which can be one of:
2164///
2165/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2166/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2167/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2168/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2169/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2170///
2171/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
2172#[inline]
2173#[target_feature(enable = "avx512fp16")]
2174#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2175#[rustc_legacy_const_generics(2)]
2176#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2177pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2178    static_assert_rounding!(ROUNDING);
2179    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2180}
2181
2182/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2183/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2184/// writemask k (the element is copied from src when mask bit 0 is not set).
2185/// Rounding is done according to the rounding parameter, which can be one of:
2186///
2187/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2188/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2189/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2190/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2191/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2192///
2193/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
2194#[inline]
2195#[target_feature(enable = "avx512fp16")]
2196#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2197#[rustc_legacy_const_generics(4)]
2198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2199pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
2200    src: __m128h,
2201    k: __mmask8,
2202    a: __m128h,
2203    b: __m128h,
2204) -> __m128h {
2205    unsafe {
2206        static_assert_rounding!(ROUNDING);
2207        vmulsh(a, b, src, k, ROUNDING)
2208    }
2209}
2210
2211/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2212/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2213/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2214/// Rounding is done according to the rounding parameter, which can be one of:
2215///
2216/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2217/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2218/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2219/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2220/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2221///
2222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
2223#[inline]
2224#[target_feature(enable = "avx512fp16")]
2225#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2226#[rustc_legacy_const_generics(3)]
2227#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2228pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2229    static_assert_rounding!(ROUNDING);
2230    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2231}
2232
2233/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2234/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2235///
2236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
2237#[inline]
2238#[target_feature(enable = "avx512fp16")]
2239#[cfg_attr(test, assert_instr(vmulsh))]
2240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2241pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2242    _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2243}
2244
2245/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2246/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2247/// writemask k (the element is copied from src when mask bit 0 is not set).
2248///
2249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
2250#[inline]
2251#[target_feature(enable = "avx512fp16")]
2252#[cfg_attr(test, assert_instr(vmulsh))]
2253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2254pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2255    _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2256}
2257
2258/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2259/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2260/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2261///
2262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
2263#[inline]
2264#[target_feature(enable = "avx512fp16")]
2265#[cfg_attr(test, assert_instr(vmulsh))]
2266#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2267pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2268    _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2269}
2270
2271/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2272///
2273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
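///
/// # Example
///
/// An illustrative sketch (not from the original source): dividing every lane by a
/// common scalar. It assumes a CPU with `avx512fp16` and `avx512vl`; the helper
/// name `scale_down` is hypothetical.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn scale_down(v: __m128h, divisor: f16) -> __m128h {
///     // Each lane of `v` is divided by the same divisor.
///     _mm_div_ph(v, _mm_set1_ph(divisor))
/// }
/// ```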
2274#[inline]
2275#[target_feature(enable = "avx512fp16,avx512vl")]
2276#[cfg_attr(test, assert_instr(vdivph))]
2277#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2278pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
2279    unsafe { simd_div(a, b) }
2280}
2281
2282/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2283/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2284///
2285/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
2286#[inline]
2287#[target_feature(enable = "avx512fp16,avx512vl")]
2288#[cfg_attr(test, assert_instr(vdivph))]
2289#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2290pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2291    unsafe {
2292        let r = _mm_div_ph(a, b);
2293        simd_select_bitmask(k, r, src)
2294    }
2295}
2296
2297/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2298/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2299///
2300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
2301#[inline]
2302#[target_feature(enable = "avx512fp16,avx512vl")]
2303#[cfg_attr(test, assert_instr(vdivph))]
2304#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2305pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2306    unsafe {
2307        let r = _mm_div_ph(a, b);
2308        simd_select_bitmask(k, r, _mm_setzero_ph())
2309    }
2310}
2311
2312/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2313///
2314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
2315#[inline]
2316#[target_feature(enable = "avx512fp16,avx512vl")]
2317#[cfg_attr(test, assert_instr(vdivph))]
2318#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2319pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
2320    unsafe { simd_div(a, b) }
2321}
2322
2323/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2324/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2325///
2326/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2327#[inline]
2328#[target_feature(enable = "avx512fp16,avx512vl")]
2329#[cfg_attr(test, assert_instr(vdivph))]
2330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2331pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2332    unsafe {
2333        let r = _mm256_div_ph(a, b);
2334        simd_select_bitmask(k, r, src)
2335    }
2336}
2337
2338/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2339/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2340///
2341/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2342#[inline]
2343#[target_feature(enable = "avx512fp16,avx512vl")]
2344#[cfg_attr(test, assert_instr(vdivph))]
2345#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2346pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2347    unsafe {
2348        let r = _mm256_div_ph(a, b);
2349        simd_select_bitmask(k, r, _mm256_setzero_ph())
2350    }
2351}
2352
2353/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2354///
2355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
2356#[inline]
2357#[target_feature(enable = "avx512fp16")]
2358#[cfg_attr(test, assert_instr(vdivph))]
2359#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2360pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
2361    unsafe { simd_div(a, b) }
2362}
2363
2364/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2365/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2366///
2367/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2368#[inline]
2369#[target_feature(enable = "avx512fp16")]
2370#[cfg_attr(test, assert_instr(vdivph))]
2371#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2372pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2373    unsafe {
2374        let r = _mm512_div_ph(a, b);
2375        simd_select_bitmask(k, r, src)
2376    }
2377}
2378
2379/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2380/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2381///
2382/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2383#[inline]
2384#[target_feature(enable = "avx512fp16")]
2385#[cfg_attr(test, assert_instr(vdivph))]
2386#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2387pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2388    unsafe {
2389        let r = _mm512_div_ph(a, b);
2390        simd_select_bitmask(k, r, _mm512_setzero_ph())
2391    }
2392}
2393
2394/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2395/// Rounding is done according to the rounding parameter, which can be one of:
2396///
2397/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2398/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2399/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2400/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2401/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2402///
2403/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
2404#[inline]
2405#[target_feature(enable = "avx512fp16")]
2406#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2407#[rustc_legacy_const_generics(2)]
2408#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2409pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2410    unsafe {
2411        static_assert_rounding!(ROUNDING);
2412        vdivph(a, b, ROUNDING)
2413    }
2414}
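
// Illustrative usage sketch (hypothetical values): the `_round` variants take the rounding
// mode as a const generic. It must be a rounding direction combined with `_MM_FROUND_NO_EXC`,
// or `_MM_FROUND_CUR_DIRECTION`; `static_assert_rounding!` rejects other values at compile time.
//
//     let a = _mm512_set1_ph(1.0);
//     let b = _mm512_set1_ph(3.0);
//     // Round each quotient toward zero instead of using the current MXCSR rounding mode.
//     let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);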
2415
2416/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2417/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2418/// Rounding is done according to the rounding parameter, which can be one of:
2419///
2420/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2421/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2422/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2423/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2424/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2425///
2426/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
2427#[inline]
2428#[target_feature(enable = "avx512fp16")]
2429#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2430#[rustc_legacy_const_generics(4)]
2431#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2432pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
2433    src: __m512h,
2434    k: __mmask32,
2435    a: __m512h,
2436    b: __m512h,
2437) -> __m512h {
2438    unsafe {
2439        static_assert_rounding!(ROUNDING);
2440        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2441        simd_select_bitmask(k, r, src)
2442    }
2443}
2444
2445/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2446/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2447/// Rounding is done according to the rounding parameter, which can be one of:
2448///
2449/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2450/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2451/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2452/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2453/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2454///
2455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
2456#[inline]
2457#[target_feature(enable = "avx512fp16")]
2458#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2459#[rustc_legacy_const_generics(3)]
2460#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2461pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
2462    k: __mmask32,
2463    a: __m512h,
2464    b: __m512h,
2465) -> __m512h {
2466    unsafe {
2467        static_assert_rounding!(ROUNDING);
2468        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2469        simd_select_bitmask(k, r, _mm512_setzero_ph())
2470    }
2471}
2472
2473/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store
2474/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2475/// Rounding is done according to the rounding parameter, which can be one of:
2476///
2477/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2478/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2479/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2480/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2481/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2482///
2483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
2484#[inline]
2485#[target_feature(enable = "avx512fp16")]
2486#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2487#[rustc_legacy_const_generics(2)]
2488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2489pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2490    static_assert_rounding!(ROUNDING);
2491    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2492}
2493
2494/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store
2495/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2496/// writemask k (the element is copied from src when mask bit 0 is not set).
2497/// Rounding is done according to the rounding parameter, which can be one of:
2498///
2499/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2500/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2501/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2502/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2503/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2504///
2505/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
2506#[inline]
2507#[target_feature(enable = "avx512fp16")]
2508#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2509#[rustc_legacy_const_generics(4)]
2510#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2511pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
2512    src: __m128h,
2513    k: __mmask8,
2514    a: __m128h,
2515    b: __m128h,
2516) -> __m128h {
2517    unsafe {
2518        static_assert_rounding!(ROUNDING);
2519        vdivsh(a, b, src, k, ROUNDING)
2520    }
2521}
2522
2523/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store
2524/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2525/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2526/// Rounding is done according to the rounding parameter, which can be one of:
2527///
2528/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2529/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2530/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2531/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2532/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2533///
2534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
2535#[inline]
2536#[target_feature(enable = "avx512fp16")]
2537#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2538#[rustc_legacy_const_generics(3)]
2539#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2540pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2541    static_assert_rounding!(ROUNDING);
2542    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2543}
2544
2545/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store
2546/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2547///
2548/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
2549#[inline]
2550#[target_feature(enable = "avx512fp16")]
2551#[cfg_attr(test, assert_instr(vdivsh))]
2552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2553pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2554    _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2555}
2556
2557/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store
2558/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2559/// writemask k (the element is copied from src when mask bit 0 is not set).
2560///
2561/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
2562#[inline]
2563#[target_feature(enable = "avx512fp16")]
2564#[cfg_attr(test, assert_instr(vdivsh))]
2565#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2566pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2567    _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2568}
2569
2570/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store
2571/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2572/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2573///
2574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
2575#[inline]
2576#[target_feature(enable = "avx512fp16")]
2577#[cfg_attr(test, assert_instr(vdivsh))]
2578#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2579pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2580    _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2581}
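
// Illustrative usage sketch (hypothetical values): the `_sh` forms operate on element 0 only;
// elements 1..=7 of the result are copied from `a`, and mask bit 0 decides whether element 0
// comes from the quotient or from `src` (or zero, for the maskz form).
//
//     let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 8.0); // element 0 is 8.0
//     let b = _mm_set1_ph(2.0);
//     let r = _mm_div_sh(a, b); // element 0 becomes 8.0 / 2.0 = 4.0, elements 1..=7 come from a
//     let r2 = _mm_mask_div_sh(_mm_set1_ph(-1.0), 0, a, b); // mask bit 0 clear: element 0 is -1.0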
2582
2583/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2584/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2585/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2586///
2587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
2588#[inline]
2589#[target_feature(enable = "avx512fp16,avx512vl")]
2590#[cfg_attr(test, assert_instr(vfmulcph))]
2591#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2592pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
2593    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
2594}
2595
2596/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2597/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2598/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2599///
2600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
2601#[inline]
2602#[target_feature(enable = "avx512fp16,avx512vl")]
2603#[cfg_attr(test, assert_instr(vfmulcph))]
2604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2605pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2606    unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
2607}
2608
2609/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2610/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2611/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2612///
2613/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
2614#[inline]
2615#[target_feature(enable = "avx512fp16,avx512vl")]
2616#[cfg_attr(test, assert_instr(vfmulcph))]
2617#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2618pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2619    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
2620}
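
// Illustrative usage sketch (hypothetical values): each complex value occupies two adjacent
// f16 elements (real part in the even slot, imaginary part in the odd slot), and each bit of
// the mask in the `_pch` forms selects one whole real/imaginary pair, not one f16 element.
// For pair 0 below, (1 + 2i) * (3 + 4i) = -5 + 10i.
//
//     let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // pair 0 = 1 + 2i
//     let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // pair 0 = 3 + 4i
//     let r = _mm_mul_pch(a, b); // element 0 = -5.0 (real), element 1 = 10.0 (imaginary)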
2621
2622/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2623/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2624/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2625///
2626/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
2627#[inline]
2628#[target_feature(enable = "avx512fp16,avx512vl")]
2629#[cfg_attr(test, assert_instr(vfmulcph))]
2630#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2631pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
2632    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
2633}
2634
2635/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2636/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2637/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2638///
2639/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
2640#[inline]
2641#[target_feature(enable = "avx512fp16,avx512vl")]
2642#[cfg_attr(test, assert_instr(vfmulcph))]
2643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2644pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2645    unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
2646}
2647
2648/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2649/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2650/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2651///
2652/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
2653#[inline]
2654#[target_feature(enable = "avx512fp16,avx512vl")]
2655#[cfg_attr(test, assert_instr(vfmulcph))]
2656#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2657pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2658    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
2659}
2660
2661/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2662/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2663/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2664///
2665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
2666#[inline]
2667#[target_feature(enable = "avx512fp16")]
2668#[cfg_attr(test, assert_instr(vfmulcph))]
2669#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2670pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
2671    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
2672}
2673
2674/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2675/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2676/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2677///
2678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
2679#[inline]
2680#[target_feature(enable = "avx512fp16")]
2681#[cfg_attr(test, assert_instr(vfmulcph))]
2682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2683pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2684    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2685}
2686
2687/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2688/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2689/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2690///
2691/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
2692#[inline]
2693#[target_feature(enable = "avx512fp16")]
2694#[cfg_attr(test, assert_instr(vfmulcph))]
2695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2696pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2697    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
2698}
2699
2700/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
2701/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2702/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2703///
2704/// Rounding is done according to the rounding parameter, which can be one of:
2705///
2706/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2707/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2708/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2709/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2710/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2711///
2712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
2713#[inline]
2714#[target_feature(enable = "avx512fp16")]
2715#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2716#[rustc_legacy_const_generics(2)]
2717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2718pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2719    static_assert_rounding!(ROUNDING);
2720    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
2721}
2722
2723/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
2724/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2725/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2726///
2727/// Rounding is done according to the rounding parameter, which can be one of:
2728///
2729/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2730/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2731/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2732/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2733/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2734///
2735/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
2736#[inline]
2737#[target_feature(enable = "avx512fp16")]
2738#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2739#[rustc_legacy_const_generics(4)]
2740#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2741pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
2742    src: __m512h,
2743    k: __mmask16,
2744    a: __m512h,
2745    b: __m512h,
2746) -> __m512h {
2747    unsafe {
2748        static_assert_rounding!(ROUNDING);
2749        transmute(vfmulcph_512(
2750            transmute(a),
2751            transmute(b),
2752            transmute(src),
2753            k,
2754            ROUNDING,
2755        ))
2756    }
2757}
2758
2759/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2760/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2761/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2762///
2763/// Rounding is done according to the rounding parameter, which can be one of:
2764///
2765/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2766/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2767/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2768/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2769/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2770///
2771/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
2772#[inline]
2773#[target_feature(enable = "avx512fp16")]
2774#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2775#[rustc_legacy_const_generics(3)]
2776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2777pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
2778    k: __mmask16,
2779    a: __m512h,
2780    b: __m512h,
2781) -> __m512h {
2782    static_assert_rounding!(ROUNDING);
2783    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
2784}
2785
2786/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2787/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2788/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2789/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2790///
2791/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
2792#[inline]
2793#[target_feature(enable = "avx512fp16")]
2794#[cfg_attr(test, assert_instr(vfmulcsh))]
2795#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2796pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
2797    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
2798}
2799
2800/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2801/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2802/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
2803/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2804///
2805/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
2806#[inline]
2807#[target_feature(enable = "avx512fp16")]
2808#[cfg_attr(test, assert_instr(vfmulcsh))]
2809#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2810pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2811    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2812}
2813
2814/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2815/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2816/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2817/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2818///
2819/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
2820#[inline]
2821#[target_feature(enable = "avx512fp16")]
2822#[cfg_attr(test, assert_instr(vfmulcsh))]
2823#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2824pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2825    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b)
2826}
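
// Illustrative usage sketch (hypothetical values): the `_sch` forms multiply only the complex
// value held in elements 0..=1; elements 2..=7 of the result are copied from `a`, and mask
// bit 0 governs the whole lower real/imaginary pair.
//
//     let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // lower pair = 1 + 2i
//     let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // lower pair = 3 + 4i
//     let r = _mm_mul_sch(a, b); // elements 0..=1 = (-5.0, 10.0), elements 2..=7 come from a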
2827
2828/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2829/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2830/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2831/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2832///
2833/// Rounding is done according to the rounding parameter, which can be one of:
2834///
2835/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2836/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2837/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2838/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2839/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2840///
2841/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
2842#[inline]
2843#[target_feature(enable = "avx512fp16")]
2844#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2845#[rustc_legacy_const_generics(2)]
2846#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2847pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2848    static_assert_rounding!(ROUNDING);
2849    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2850}
2851
2852/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2853/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2854/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2855/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2856///
2857/// Rounding is done according to the rounding parameter, which can be one of:
2858///
2859/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2860/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2861/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2862/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2863/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2864///
2865/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
2866#[inline]
2867#[target_feature(enable = "avx512fp16")]
2868#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2869#[rustc_legacy_const_generics(4)]
2870#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2871pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
2872    src: __m128h,
2873    k: __mmask8,
2874    a: __m128h,
2875    b: __m128h,
2876) -> __m128h {
2877    unsafe {
2878        static_assert_rounding!(ROUNDING);
2879        transmute(vfmulcsh(
2880            transmute(a),
2881            transmute(b),
2882            transmute(src),
2883            k,
2884            ROUNDING,
2885        ))
2886    }
2887}
2888
2889/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2890/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2891/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2892/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2893///
2894/// Rounding is done according to the rounding parameter, which can be one of:
2895///
2896/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2897/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2898/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2899/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2900/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2901///
2902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
2903#[inline]
2904#[target_feature(enable = "avx512fp16")]
2905#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2906#[rustc_legacy_const_generics(3)]
2907#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2908pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
2909    k: __mmask8,
2910    a: __m128h,
2911    b: __m128h,
2912) -> __m128h {
2913    static_assert_rounding!(ROUNDING);
2914    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2915}
2916
2917/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2918/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2919/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2920///
2921/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
2922#[inline]
2923#[target_feature(enable = "avx512fp16,avx512vl")]
2924#[cfg_attr(test, assert_instr(vfmulcph))]
2925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2926pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
2927    _mm_mul_pch(a, b)
2928}
2929
2930/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2931/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2932/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2933///
2934/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
2935#[inline]
2936#[target_feature(enable = "avx512fp16,avx512vl")]
2937#[cfg_attr(test, assert_instr(vfmulcph))]
2938#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2939pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2940    _mm_mask_mul_pch(src, k, a, b)
2941}
2942
2943/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2944/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2945/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2946///
2947/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
2948#[inline]
2949#[target_feature(enable = "avx512fp16,avx512vl")]
2950#[cfg_attr(test, assert_instr(vfmulcph))]
2951#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2952pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2953    _mm_maskz_mul_pch(k, a, b)
2954}
2955
2956/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2957/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2958/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2959///
2960/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
2961#[inline]
2962#[target_feature(enable = "avx512fp16,avx512vl")]
2963#[cfg_attr(test, assert_instr(vfmulcph))]
2964#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2965pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
2966    _mm256_mul_pch(a, b)
2967}
2968
2969/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2970/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2971/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2972///
2973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
2974#[inline]
2975#[target_feature(enable = "avx512fp16,avx512vl")]
2976#[cfg_attr(test, assert_instr(vfmulcph))]
2977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2978pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2979    _mm256_mask_mul_pch(src, k, a, b)
2980}
2981
2982/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2983/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2984/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2985///
2986/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
2987#[inline]
2988#[target_feature(enable = "avx512fp16,avx512vl")]
2989#[cfg_attr(test, assert_instr(vfmulcph))]
2990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2991pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2992    _mm256_maskz_mul_pch(k, a, b)
2993}
2994
2995/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
2996/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2997///
2998/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
2999#[inline]
3000#[target_feature(enable = "avx512fp16")]
3001#[cfg_attr(test, assert_instr(vfmulcph))]
3002#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3003pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
3004    _mm512_mul_pch(a, b)
3005}
3006
3007/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3008/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3009/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3010///
3011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
3012#[inline]
3013#[target_feature(enable = "avx512fp16")]
3014#[cfg_attr(test, assert_instr(vfmulcph))]
3015#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3016pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3017    _mm512_mask_mul_pch(src, k, a, b)
3018}
3019
3020/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3021/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3022/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3023///
3024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
3025#[inline]
3026#[target_feature(enable = "avx512fp16")]
3027#[cfg_attr(test, assert_instr(vfmulcph))]
3028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3029pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3030    _mm512_maskz_mul_pch(k, a, b)
3031}
3032
3033/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
3034/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3035/// Rounding is done according to the rounding parameter, which can be one of:
3036///
3037/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3038/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3039/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3040/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3041/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3042///
3043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
3044#[inline]
3045#[target_feature(enable = "avx512fp16")]
3046#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3047#[rustc_legacy_const_generics(2)]
3048#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3049pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3050    static_assert_rounding!(ROUNDING);
3051    _mm512_mul_round_pch::<ROUNDING>(a, b)
3052}
3053
3054/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3055/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3056/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3057/// Rounding is done according to the rounding parameter, which can be one of:
3058///
3059/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3060/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3061/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3062/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3063/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3064///
3065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
3066#[inline]
3067#[target_feature(enable = "avx512fp16")]
3068#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3069#[rustc_legacy_const_generics(4)]
3070#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3071pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
3072    src: __m512h,
3073    k: __mmask16,
3074    a: __m512h,
3075    b: __m512h,
3076) -> __m512h {
3077    static_assert_rounding!(ROUNDING);
3078    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
3079}
3080
3081/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3082/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3083/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3084/// Rounding is done according to the rounding parameter, which can be one of:
3085///
3086/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3087/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3088/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3089/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3090/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3091///
3092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
3093#[inline]
3094#[target_feature(enable = "avx512fp16")]
3095#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3096#[rustc_legacy_const_generics(3)]
3097#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3098pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
3099    k: __mmask16,
3100    a: __m512h,
3101    b: __m512h,
3102) -> __m512h {
3103    static_assert_rounding!(ROUNDING);
3104    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
3105}
3106
3107/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
3108/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3109/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3110///
3111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
3112#[inline]
3113#[target_feature(enable = "avx512fp16")]
3114#[cfg_attr(test, assert_instr(vfmulcsh))]
3115#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3116pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
3117    _mm_mul_sch(a, b)
3118}
3119
3120/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
3121/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3122/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3123///
3124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
3125#[inline]
3126#[target_feature(enable = "avx512fp16")]
3127#[cfg_attr(test, assert_instr(vfmulcsh))]
3128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3129pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3130    _mm_mask_mul_sch(src, k, a, b)
3131}
3132
3133/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
3134/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3135/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3136///
3137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
3138#[inline]
3139#[target_feature(enable = "avx512fp16")]
3140#[cfg_attr(test, assert_instr(vfmulcsh))]
3141#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3142pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3143    _mm_maskz_mul_sch(k, a, b)
3144}
3145
3146/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
3147/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3148///
3149/// Rounding is done according to the rounding parameter, which can be one of:
3150///
3151/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3152/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3153/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3154/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3155/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3156///
3157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
3158#[inline]
3159#[target_feature(enable = "avx512fp16")]
3160#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3161#[rustc_legacy_const_generics(2)]
3162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3163pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3164    static_assert_rounding!(ROUNDING);
3165    _mm_mul_round_sch::<ROUNDING>(a, b)
3166}
3167
3168/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
3169/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3170/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3171///
3172/// Rounding is done according to the rounding parameter, which can be one of:
3173///
3174/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3175/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3176/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3177/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3178/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3179///
3180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
3181#[inline]
3182#[target_feature(enable = "avx512fp16")]
3183#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3184#[rustc_legacy_const_generics(4)]
3185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3186pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
3187    src: __m128h,
3188    k: __mmask8,
3189    a: __m128h,
3190    b: __m128h,
3191) -> __m128h {
3192    static_assert_rounding!(ROUNDING);
3193    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
3194}
3195
3196/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
3197/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3198/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3199///
3200/// Rounding is done according to the rounding parameter, which can be one of:
3201///
3202/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3203/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3204/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3205/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3206/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3207///
3208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
3209#[inline]
3210#[target_feature(enable = "avx512fp16")]
3211#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3212#[rustc_legacy_const_generics(3)]
3213#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3214pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
3215    k: __mmask8,
3216    a: __m128h,
3217    b: __m128h,
3218) -> __m128h {
3219    static_assert_rounding!(ROUNDING);
3220    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
3221}
3222
3223/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3224/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3225/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3226/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3227///
3228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
3229#[inline]
3230#[target_feature(enable = "avx512fp16,avx512vl")]
3231#[cfg_attr(test, assert_instr(vfcmulcph))]
3232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3233pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
3234    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
3235}
3236
3237/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3238/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3239/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3240/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3241///
3242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
3243#[inline]
3244#[target_feature(enable = "avx512fp16,avx512vl")]
3245#[cfg_attr(test, assert_instr(vfcmulcph))]
3246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3247pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3248    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
3249}
3250
3251/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3252/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3253/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3254/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3255///
3256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
3257#[inline]
3258#[target_feature(enable = "avx512fp16,avx512vl")]
3259#[cfg_attr(test, assert_instr(vfcmulcph))]
3260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3261pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3262    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
3263}
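
// Illustrative usage sketch (hypothetical values): the `_cmul` forms conjugate the second
// operand before multiplying, so for pair 0 below,
// (1 + 2i) * conj(3 + 4i) = (1 + 2i) * (3 - 4i) = 11 + 2i.
//
//     let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // pair 0 = 1 + 2i
//     let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // pair 0 = 3 + 4i
//     let r = _mm_cmul_pch(a, b); // element 0 = 11.0 (real), element 1 = 2.0 (imaginary)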
3264
3265/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3266/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3267/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3268/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3269///
3270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
3271#[inline]
3272#[target_feature(enable = "avx512fp16,avx512vl")]
3273#[cfg_attr(test, assert_instr(vfcmulcph))]
3274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3275pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
3276    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
3277}
3278
3279/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3280/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3281/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3282/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3283///
3284/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
3285#[inline]
3286#[target_feature(enable = "avx512fp16,avx512vl")]
3287#[cfg_attr(test, assert_instr(vfcmulcph))]
3288#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3289pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3290    unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
3291}
3292
3293/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3294/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3295/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3296/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3297///
3298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
3299#[inline]
3300#[target_feature(enable = "avx512fp16,avx512vl")]
3301#[cfg_attr(test, assert_instr(vfcmulcph))]
3302#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3303pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3304    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
3305}
3306
3307/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3308/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3309/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3310/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3311///
3312/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
3313#[inline]
3314#[target_feature(enable = "avx512fp16")]
3315#[cfg_attr(test, assert_instr(vfcmulcph))]
3316#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3317pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
3318    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
3319}
3320
3321/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3322/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3323/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3324/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3325///
3326/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
3327#[inline]
3328#[target_feature(enable = "avx512fp16")]
3329#[cfg_attr(test, assert_instr(vfcmulcph))]
3330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3331pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3332    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3333}
3334
3335/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3336/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3337/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3338/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3339///
3340/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
3341#[inline]
3342#[target_feature(enable = "avx512fp16")]
3343#[cfg_attr(test, assert_instr(vfcmulcph))]
3344#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3345pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3346    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
3347}
3348
3349/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3350/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3351/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3352/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3353///
3354/// Rounding is done according to the rounding parameter, which can be one of:
3355///
3356/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3357/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3358/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3359/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3360/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3361///
3362/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
3363#[inline]
3364#[target_feature(enable = "avx512fp16")]
3365#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3366#[rustc_legacy_const_generics(2)]
3367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3368pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3369    static_assert_rounding!(ROUNDING);
3370    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
3371}
3372
3373/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3374/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3375/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3376/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3377///
3378/// Rounding is done according to the rounding parameter, which can be one of:
3379///
3380/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3381/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3382/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3383/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3384/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3385///
3386/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
3387#[inline]
3388#[target_feature(enable = "avx512fp16")]
3389#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3390#[rustc_legacy_const_generics(4)]
3391#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3392pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
3393    src: __m512h,
3394    k: __mmask16,
3395    a: __m512h,
3396    b: __m512h,
3397) -> __m512h {
3398    unsafe {
3399        static_assert_rounding!(ROUNDING);
3400        transmute(vfcmulcph_512(
3401            transmute(a),
3402            transmute(b),
3403            transmute(src),
3404            k,
3405            ROUNDING,
3406        ))
3407    }
3408}
3409
3410/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3411/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3412/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3413/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3414///
3415/// Rounding is done according to the rounding parameter, which can be one of:
3416///
3417/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3418/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3419/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3420/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3421/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3422///
3423/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
3424#[inline]
3425#[target_feature(enable = "avx512fp16")]
3426#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3427#[rustc_legacy_const_generics(3)]
3428#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3429pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
3430    k: __mmask16,
3431    a: __m512h,
3432    b: __m512h,
3433) -> __m512h {
3434    static_assert_rounding!(ROUNDING);
3435    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
3436}
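
// Illustrative sketch (not part of the original source): selecting an explicit
// rounding mode at compile time instead of inheriting `MXCSR.RC`. The constant
// combines one rounding direction with `_MM_FROUND_NO_EXC`, as listed above.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn _cmul_round_pch_usage_sketch(a: __m512h, b: __m512h) -> __m512h {
    _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
}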
3437
3438/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3439/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3440/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3441///
3442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
3443#[inline]
3444#[target_feature(enable = "avx512fp16")]
3445#[cfg_attr(test, assert_instr(vfcmulcsh))]
3446#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3447pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
3448    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
3449}
3450
3451/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3452/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3453/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3454/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3455///
3456/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
3457#[inline]
3458#[target_feature(enable = "avx512fp16")]
3459#[cfg_attr(test, assert_instr(vfcmulcsh))]
3460#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3461pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3462    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3463}
3464
3465/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3466/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3467/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3468/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3469///
3470/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
3471#[inline]
3472#[target_feature(enable = "avx512fp16")]
3473#[cfg_attr(test, assert_instr(vfcmulcsh))]
3474#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3475pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3476    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b)
3477}
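
// Illustrative sketch (not part of the original source): a single conjugate
// multiply on the lowest complex number. With a = 1 + 2i and b = 3 + 4i,
// a * conj(b) = (1 + 2i)(3 - 4i) = 11 + 2i, so lanes 0 and 1 of the result
// hold 11.0 and 2.0; the upper lanes are not inspected here.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn _cmul_sch_usage_sketch() -> __m128h {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // lanes [1, 2, 0, ...]
    let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // lanes [3, 4, 0, ...]
    _mm_cmul_sch(a, b)
}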
3478
3479/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3480/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3481/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3482///
3483/// Rounding is done according to the rounding parameter, which can be one of:
3484///
3485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3490///
3491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
3492#[inline]
3493#[target_feature(enable = "avx512fp16")]
3494#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3495#[rustc_legacy_const_generics(2)]
3496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3497pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3498    static_assert_rounding!(ROUNDING);
3499    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
3500}
3501
3502/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3503/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3504/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3505/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3506///
3507/// Rounding is done according to the rounding parameter, which can be one of:
3508///
3509/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3510/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3511/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3512/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3513/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3514///
3515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
3516#[inline]
3517#[target_feature(enable = "avx512fp16")]
3518#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3519#[rustc_legacy_const_generics(4)]
3520#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3521pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
3522    src: __m128h,
3523    k: __mmask8,
3524    a: __m128h,
3525    b: __m128h,
3526) -> __m128h {
3527    unsafe {
3528        static_assert_rounding!(ROUNDING);
3529        transmute(vfcmulcsh(
3530            transmute(a),
3531            transmute(b),
3532            transmute(src),
3533            k,
3534            ROUNDING,
3535        ))
3536    }
3537}
3538
3539/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3540/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3541/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3542/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3543///
3544/// Rounding is done according to the rounding parameter, which can be one of:
3545///
3546/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3547/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3548/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3549/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3550/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3551///
3552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
3553#[inline]
3554#[target_feature(enable = "avx512fp16")]
3555#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3556#[rustc_legacy_const_generics(3)]
3557#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3558pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
3559    k: __mmask8,
3560    a: __m128h,
3561    b: __m128h,
3562) -> __m128h {
3563    static_assert_rounding!(ROUNDING);
3564    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
3565}
3566
3567/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3568/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3569/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3570/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3571///
3572/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
3573#[inline]
3574#[target_feature(enable = "avx512fp16,avx512vl")]
3575#[cfg_attr(test, assert_instr(vfcmulcph))]
3576#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3577pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
3578    _mm_cmul_pch(a, b)
3579}
3580
3581/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3582/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3583/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3584/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3585///
3586/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
3587#[inline]
3588#[target_feature(enable = "avx512fp16,avx512vl")]
3589#[cfg_attr(test, assert_instr(vfcmulcph))]
3590#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3591pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3592    _mm_mask_cmul_pch(src, k, a, b)
3593}
3594
3595/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3596/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3597/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3598/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3599///
3600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
3601#[inline]
3602#[target_feature(enable = "avx512fp16,avx512vl")]
3603#[cfg_attr(test, assert_instr(vfcmulcph))]
3604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3605pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3606    _mm_maskz_cmul_pch(k, a, b)
3607}
3608
3609/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3610/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3611/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3612/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3613///
3614/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
3615#[inline]
3616#[target_feature(enable = "avx512fp16,avx512vl")]
3617#[cfg_attr(test, assert_instr(vfcmulcph))]
3618#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3619pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
3620    _mm256_cmul_pch(a, b)
3621}
3622
3623/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3624/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3625/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3626/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3627///
3628/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
3629#[inline]
3630#[target_feature(enable = "avx512fp16,avx512vl")]
3631#[cfg_attr(test, assert_instr(vfcmulcph))]
3632#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3633pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3634    _mm256_mask_cmul_pch(src, k, a, b)
3635}
3636
3637/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3638/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3639/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3640/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3641///
3642/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
3643#[inline]
3644#[target_feature(enable = "avx512fp16,avx512vl")]
3645#[cfg_attr(test, assert_instr(vfcmulcph))]
3646#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3647pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3648    _mm256_maskz_cmul_pch(k, a, b)
3649}
3650
3651/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3652/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3653/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3654/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3655///
3656/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
3657#[inline]
3658#[target_feature(enable = "avx512fp16")]
3659#[cfg_attr(test, assert_instr(vfcmulcph))]
3660#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3661pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
3662    _mm512_cmul_pch(a, b)
3663}
3664
3665/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3666/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3667/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3668/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3669///
3670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
3671#[inline]
3672#[target_feature(enable = "avx512fp16")]
3673#[cfg_attr(test, assert_instr(vfcmulcph))]
3674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3675pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3676    _mm512_mask_cmul_pch(src, k, a, b)
3677}
3678
3679/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3680/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3681/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3682/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3683///
3684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
3685#[inline]
3686#[target_feature(enable = "avx512fp16")]
3687#[cfg_attr(test, assert_instr(vfcmulcph))]
3688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3689pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3690    _mm512_maskz_cmul_pch(k, a, b)
3691}
3692
3693/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3694/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3695/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3696///
3697/// Rounding is done according to the rounding parameter, which can be one of:
3698///
3699/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3700/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3701/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3702/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3703/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3704///
3705/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
3706#[inline]
3707#[target_feature(enable = "avx512fp16")]
3708#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3709#[rustc_legacy_const_generics(2)]
3710#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3711pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3712    static_assert_rounding!(ROUNDING);
3713    _mm512_cmul_round_pch::<ROUNDING>(a, b)
3714}
3715
3716/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3717/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3718/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3719/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3720///
3721/// Rounding is done according to the rounding parameter, which can be one of:
3722///
3723/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3724/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3725/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3726/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3727/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3728///
3729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
3730#[inline]
3731#[target_feature(enable = "avx512fp16")]
3732#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3733#[rustc_legacy_const_generics(4)]
3734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3735pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
3736    src: __m512h,
3737    k: __mmask16,
3738    a: __m512h,
3739    b: __m512h,
3740) -> __m512h {
3741    static_assert_rounding!(ROUNDING);
3742    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
3743}
3744
3745/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3746/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3747/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3748/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3749///
3750/// Rounding is done according to the rounding parameter, which can be one of:
3751///
3752/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3753/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3754/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3755/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3756/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3757///
3758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
3759#[inline]
3760#[target_feature(enable = "avx512fp16")]
3761#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3762#[rustc_legacy_const_generics(3)]
3763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3764pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
3765    k: __mmask16,
3766    a: __m512h,
3767    b: __m512h,
3768) -> __m512h {
3769    static_assert_rounding!(ROUNDING);
3770    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
3771}
3772
3773/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3774/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3775/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3776/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3777///
3778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
3779#[inline]
3780#[target_feature(enable = "avx512fp16")]
3781#[cfg_attr(test, assert_instr(vfcmulcsh))]
3782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3783pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
3784    _mm_cmul_sch(a, b)
3785}
3786
3787/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3788/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3789/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3790/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3791///
3792/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
3793#[inline]
3794#[target_feature(enable = "avx512fp16")]
3795#[cfg_attr(test, assert_instr(vfcmulcsh))]
3796#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3797pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3798    _mm_mask_cmul_sch(src, k, a, b)
3799}
3800
3801/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3802/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3803/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3804/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3805///
3806/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
3807#[inline]
3808#[target_feature(enable = "avx512fp16")]
3809#[cfg_attr(test, assert_instr(vfcmulcsh))]
3810#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3811pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3812    _mm_maskz_cmul_sch(k, a, b)
3813}
3814
3815/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3816/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3817/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3818///
3819/// Rounding is done according to the rounding parameter, which can be one of:
3820///
3821/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3822/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3823/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3824/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3825/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3826///
3827/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
3828#[inline]
3829#[target_feature(enable = "avx512fp16")]
3830#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3831#[rustc_legacy_const_generics(2)]
3832#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3833pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3834    static_assert_rounding!(ROUNDING);
3835    _mm_cmul_round_sch::<ROUNDING>(a, b)
3836}
3837
3838/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3839/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3840/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3841/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3842///
3843/// Rounding is done according to the rounding parameter, which can be one of:
3844///
3845/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3846/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3847/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3848/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3849/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3850///
3851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
3852#[inline]
3853#[target_feature(enable = "avx512fp16")]
3854#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3855#[rustc_legacy_const_generics(4)]
3856#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3857pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
3858    src: __m128h,
3859    k: __mmask8,
3860    a: __m128h,
3861    b: __m128h,
3862) -> __m128h {
3863    static_assert_rounding!(ROUNDING);
3864    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
3865}
3866
3867/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3868/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3869/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3870/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3871///
3872/// Rounding is done according to the rounding parameter, which can be one of:
3873///
3874/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3875/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3876/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3877/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3878/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3879///
3880/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
3881#[inline]
3882#[target_feature(enable = "avx512fp16")]
3883#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3884#[rustc_legacy_const_generics(3)]
3885#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3886pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
3887    k: __mmask8,
3888    a: __m128h,
3889    b: __m128h,
3890) -> __m128h {
3891    static_assert_rounding!(ROUNDING);
3892    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
3893}
3894
3895/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3896/// the results in dst.
3897///
3898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
3899#[inline]
3900#[target_feature(enable = "avx512fp16,avx512vl")]
3901#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3902pub fn _mm_abs_ph(v2: __m128h) -> __m128h {
3903    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
3904}
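
// Illustrative sketch (not part of the original source). The AND with
// `i16::MAX` (0x7FFF) clears bit 15 of every 16-bit lane, i.e. the IEEE 754
// sign bit, so -2.5 becomes 2.5 while the magnitude encoding is untouched.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn _abs_ph_usage_sketch() -> __m128h {
    let v = _mm_set1_ph(-2.5);
    _mm_abs_ph(v) // every lane becomes 2.5
}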
3905
3906/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3907/// the result in dst.
3908///
3909/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
3910#[inline]
3911#[target_feature(enable = "avx512fp16,avx512vl")]
3912#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3913pub fn _mm256_abs_ph(v2: __m256h) -> __m256h {
3914    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
3915}
3916
3917/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3918/// the result in dst.
3919///
3920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
3921#[inline]
3922#[target_feature(enable = "avx512fp16")]
3923#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3924pub fn _mm512_abs_ph(v2: __m512h) -> __m512h {
3925    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
3926}
3927
3928/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
3929/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
3930/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
3931/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3932///
3933/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
3934#[inline]
3935#[target_feature(enable = "avx512fp16,avx512vl")]
3936#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3937pub fn _mm_conj_pch(a: __m128h) -> __m128h {
3938    unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) }
3939}
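
// Illustrative sketch (not part of the original source). XOR-ing each 32-bit
// lane with `i32::MIN` (0x8000_0000) flips only bit 31, the sign bit of the
// odd-indexed (imaginary) half of each complex pair, so 3 + 4i becomes 3 - 4i
// while the real parts are untouched.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn _conj_pch_usage_sketch() -> __m128h {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // lanes [3, 4, 0, ...]
    _mm_conj_pch(a) // lanes [3, -4, 0, -0.0, ...]
}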
3940
3941/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
3942/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
3943/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
3944/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3945///
3946/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
3947#[inline]
3948#[target_feature(enable = "avx512fp16,avx512vl")]
3949#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3950pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
3951    unsafe {
3952        let r: __m128 = transmute(_mm_conj_pch(a));
3953        transmute(simd_select_bitmask(k, r, transmute(src)))
3954    }
3955}
3956
3957/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
3958/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
3959/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3960/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3961///
3962/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
3963#[inline]
3964#[target_feature(enable = "avx512fp16,avx512vl")]
3965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3966pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
3967    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
3968}
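
// Illustrative sketch (not part of the original source): each mask bit covers
// one complex number, i.e. one 32-bit (real, imaginary) pair. With k = 0b0001
// only the first pair is conjugated; the other three pairs of the 128-bit
// vector are zeroed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn _maskz_conj_pch_usage_sketch(a: __m128h) -> __m128h {
    _mm_maskz_conj_pch(0b0001, a)
}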
3969
3970/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
3971/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3972/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3973///
3974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
3975#[inline]
3976#[target_feature(enable = "avx512fp16,avx512vl")]
3977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3978pub fn _mm256_conj_pch(a: __m256h) -> __m256h {
3979    unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) }
3980}
3981
3982/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
3983/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
3984/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3985/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3986///
3987/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
3988#[inline]
3989#[target_feature(enable = "avx512fp16,avx512vl")]
3990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3991pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
3992    unsafe {
3993        let r: __m256 = transmute(_mm256_conj_pch(a));
3994        transmute(simd_select_bitmask(k, r, transmute(src)))
3995    }
3996}
3997
3998/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
3999/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4000/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4001/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4002///
4003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
4004#[inline]
4005#[target_feature(enable = "avx512fp16,avx512vl")]
4006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4007pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
4008    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
4009}
4010
4011/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
4012/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4013/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4014///
4015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
4016#[inline]
4017#[target_feature(enable = "avx512fp16")]
4018#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4019pub fn _mm512_conj_pch(a: __m512h) -> __m512h {
4020    unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) }
4021}
4022
4023/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4024/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4025/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4026/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4027///
4028/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
4029#[inline]
4030#[target_feature(enable = "avx512fp16")]
4031#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4032pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
4033    unsafe {
4034        let r: __m512 = transmute(_mm512_conj_pch(a));
4035        transmute(simd_select_bitmask(k, r, transmute(src)))
4036    }
4037}
4038
4039/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
4040/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4041/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4042/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4043///
4044/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
4045#[inline]
4046#[target_feature(enable = "avx512fp16")]
4047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4048pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
4049    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
4050}
4051
4052/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4053/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4054/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4055///
4056/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
4057#[inline]
4058#[target_feature(enable = "avx512fp16,avx512vl")]
4059#[cfg_attr(test, assert_instr(vfmaddcph))]
4060#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4061pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4062    _mm_mask3_fmadd_pch(a, b, c, 0xff)
4063}
4064
4065/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4066/// and store the results in dst using writemask k (the element is copied from a when the corresponding
4067/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4068/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4069///
4070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
4071#[inline]
4072#[target_feature(enable = "avx512fp16,avx512vl")]
4073#[cfg_attr(test, assert_instr(vfmaddcph))]
4074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4075pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4076    unsafe {
4077        let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4078        transmute(simd_select_bitmask(k, r, transmute(a)))
4079    }
4080}
4081
4082/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4083/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4084/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4085/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4086///
4087/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
4088#[inline]
4089#[target_feature(enable = "avx512fp16,avx512vl")]
4090#[cfg_attr(test, assert_instr(vfmaddcph))]
4091#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4092pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4093    unsafe {
4094        transmute(vfmaddcph_mask3_128(
4095            transmute(a),
4096            transmute(b),
4097            transmute(c),
4098            k,
4099        ))
4100    }
4101}
4102
4103/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4104/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4105/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4106/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4107///
4108/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
4109#[inline]
4110#[target_feature(enable = "avx512fp16,avx512vl")]
4111#[cfg_attr(test, assert_instr(vfmaddcph))]
4112#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4113pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4114    unsafe {
4115        transmute(vfmaddcph_maskz_128(
4116            transmute(a),
4117            transmute(b),
4118            transmute(c),
4119            k,
4120        ))
4121    }
4122}
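
// Illustrative sketch (not part of the original source): one complex
// multiply-accumulate. With a = 1 + 2i, b = 3 + 4i and c = 1 + 1i,
// a * b + c = (-5 + 10i) + (1 + 1i) = -4 + 11i, so lanes 0 and 1 of the first
// complex pair hold -4.0 and 11.0.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn _fmadd_pch_usage_sketch() -> __m128h {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // 1 + 2i in pair 0
    let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // 3 + 4i in pair 0
    let c = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0); // 1 + 1i in pair 0
    _mm_fmadd_pch(a, b, c)
}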
4123
4124/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4125/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4126/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4127///
4128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
4129#[inline]
4130#[target_feature(enable = "avx512fp16,avx512vl")]
4131#[cfg_attr(test, assert_instr(vfmaddcph))]
4132#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4133pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4134    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
4135}
4136
4137/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4138/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4139/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4140/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4141///
4142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
4143#[inline]
4144#[target_feature(enable = "avx512fp16,avx512vl")]
4145#[cfg_attr(test, assert_instr(vfmaddcph))]
4146#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4147pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4148    unsafe {
4149        let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4150        transmute(simd_select_bitmask(k, r, transmute(a)))
4151    }
4152}
4153
4154/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4155/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4156/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4157/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4158///
4159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
4160#[inline]
4161#[target_feature(enable = "avx512fp16,avx512vl")]
4162#[cfg_attr(test, assert_instr(vfmaddcph))]
4163#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4164pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4165    unsafe {
4166        transmute(vfmaddcph_mask3_256(
4167            transmute(a),
4168            transmute(b),
4169            transmute(c),
4170            k,
4171        ))
4172    }
4173}
4174
4175/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4176/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4177/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4178/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4179///
4180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
4181#[inline]
4182#[target_feature(enable = "avx512fp16,avx512vl")]
4183#[cfg_attr(test, assert_instr(vfmaddcph))]
4184#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4185pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4186    unsafe {
4187        transmute(vfmaddcph_maskz_256(
4188            transmute(a),
4189            transmute(b),
4190            transmute(c),
4191            k,
4192        ))
4193    }
4194}
4195
4196/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4197/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4198/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4199///
4200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
4201#[inline]
4202#[target_feature(enable = "avx512fp16")]
4203#[cfg_attr(test, assert_instr(vfmaddcph))]
4204#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4205pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4206    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4207}
4208
4209/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4210/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4211/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4212/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4213///
4214/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
4215#[inline]
4216#[target_feature(enable = "avx512fp16")]
4217#[cfg_attr(test, assert_instr(vfmaddcph))]
4218#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4219pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4220    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4221}
4222
4223/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4224/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4225/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4226/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4227///
4228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
4229#[inline]
4230#[target_feature(enable = "avx512fp16")]
4231#[cfg_attr(test, assert_instr(vfmaddcph))]
4232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4233pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4234    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4235}
4236
4237/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4238/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4239/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4240/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4241///
4242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
4243#[inline]
4244#[target_feature(enable = "avx512fp16")]
4245#[cfg_attr(test, assert_instr(vfmaddcph))]
4246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4247pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4248    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4249}
4250
4251/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4252/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4253/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4254///
4255/// Rounding is done according to the rounding parameter, which can be one of:
4256///
4257/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4258/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4259/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4260/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4261/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4262///
4263/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
4264#[inline]
4265#[target_feature(enable = "avx512fp16")]
4266#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4267#[rustc_legacy_const_generics(3)]
4268#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4269pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4270    static_assert_rounding!(ROUNDING);
4271    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4272}
4273
4274/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4275/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4276/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4277/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4278///
4279/// Rounding is done according to the rounding parameter, which can be one of:
4280///
4281/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4282/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4283/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4284/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4285/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4286///
4287/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
4288#[inline]
4289#[target_feature(enable = "avx512fp16")]
4290#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4291#[rustc_legacy_const_generics(4)]
4292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4293pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
4294    a: __m512h,
4295    k: __mmask16,
4296    b: __m512h,
4297    c: __m512h,
4298) -> __m512h {
4299    unsafe {
4300        static_assert_rounding!(ROUNDING);
4301        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4302        transmute(simd_select_bitmask(k, r, transmute(a)))
4303    }
4304}
4305
4306/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4307/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4308/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4309/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4310///
4311/// Rounding is done according to the rounding parameter, which can be one of:
4312///
4313/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4314/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4315/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4316/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4317/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4318///
4319/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
4320#[inline]
4321#[target_feature(enable = "avx512fp16")]
4322#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4323#[rustc_legacy_const_generics(4)]
4324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4325pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
4326    a: __m512h,
4327    b: __m512h,
4328    c: __m512h,
4329    k: __mmask16,
4330) -> __m512h {
4331    unsafe {
4332        static_assert_rounding!(ROUNDING);
4333        transmute(vfmaddcph_mask3_512(
4334            transmute(a),
4335            transmute(b),
4336            transmute(c),
4337            k,
4338            ROUNDING,
4339        ))
4340    }
4341}
4342
4343/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4344/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4345/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4346/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4347///
4348/// Rounding is done according to the rounding parameter, which can be one of:
4349///
4350/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4351/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4352/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4353/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4354/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4355///
4356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
4357#[inline]
4358#[target_feature(enable = "avx512fp16")]
4359#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4360#[rustc_legacy_const_generics(4)]
4361#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4362pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
4363    k: __mmask16,
4364    a: __m512h,
4365    b: __m512h,
4366    c: __m512h,
4367) -> __m512h {
4368    unsafe {
4369        static_assert_rounding!(ROUNDING);
4370        transmute(vfmaddcph_maskz_512(
4371            transmute(a),
4372            transmute(b),
4373            transmute(c),
4374            k,
4375            ROUNDING,
4376        ))
4377    }
4378}
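
// A minimal usage sketch (illustrative only; `example_maskz_fmadd_round_pch`
// is a hypothetical helper, not part of this module). Each of the 16 mask
// bits governs one complex number, i.e. one adjacent pair of f16 lanes;
// cleared bits zero both lanes of that pair.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_maskz_fmadd_round_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Keep only the even-indexed complex numbers; the odd-indexed ones are zeroed.
    _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
        0b0101_0101_0101_0101,
        a,
        b,
        c,
    )
}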
4379
4380/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4381/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
4382/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
4383/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4384///
4385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
4386#[inline]
4387#[target_feature(enable = "avx512fp16")]
4388#[cfg_attr(test, assert_instr(vfmaddcsh))]
4389#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4390pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4391    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4392}
4393
4394/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4395/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4396/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4397/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4398/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4399///
4400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
4401#[inline]
4402#[target_feature(enable = "avx512fp16")]
4403#[cfg_attr(test, assert_instr(vfmaddcsh))]
4404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4405pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4406    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4407}
4408
4409/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4410/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4411/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4412/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4413/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4414///
4415/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
4416#[inline]
4417#[target_feature(enable = "avx512fp16")]
4418#[cfg_attr(test, assert_instr(vfmaddcsh))]
4419#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4420pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4421    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4422}
4423
4424/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4425/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4426/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4427/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4428/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4429///
4430/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
4431#[inline]
4432#[target_feature(enable = "avx512fp16")]
4433#[cfg_attr(test, assert_instr(vfmaddcsh))]
4434#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4435pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4436    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4437}
4438
4439/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4440/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
4441/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4442///
4443/// Rounding is done according to the rounding parameter, which can be one of:
4444///
4445/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4446/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4447/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4448/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4449/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4450///
4451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
4452#[inline]
4453#[target_feature(enable = "avx512fp16")]
4454#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4455#[rustc_legacy_const_generics(3)]
4456#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4457pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4458    unsafe {
4459        static_assert_rounding!(ROUNDING);
4460        transmute(vfmaddcsh_mask(
4461            transmute(a),
4462            transmute(b),
4463            transmute(c),
4464            0xff,
4465            ROUNDING,
4466        ))
4467    }
4468}
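
// A minimal usage sketch (illustrative only; `example_fmadd_round_sch` is a
// hypothetical helper, not part of this module). Only the lowest complex
// number (f16 lanes 0 and 1) is computed; lanes 2..8 are copied from `a`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_fmadd_round_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Request truncation (round toward zero) with exceptions suppressed.
    _mm_fmadd_round_sch::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c)
}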
4469
4470/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4471/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4472/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4473/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4474/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4475///
4476/// Rounding is done according to the rounding parameter, which can be one of:
4477///
4478/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4479/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4480/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4481/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4482/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4483///
4484/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
4485#[inline]
4486#[target_feature(enable = "avx512fp16")]
4487#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4488#[rustc_legacy_const_generics(4)]
4489#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4490pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
4491    a: __m128h,
4492    k: __mmask8,
4493    b: __m128h,
4494    c: __m128h,
4495) -> __m128h {
4496    unsafe {
4497        static_assert_rounding!(ROUNDING);
4498        let a = transmute(a);
4499        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what Clang does
4500        transmute(_mm_mask_move_ss(a, k, a, r))
4501    }
4502}
4503
4504/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4505/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4506/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4507/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4508/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4509///
4510/// Rounding is done according to the rounding parameter, which can be one of:
4511///
4512/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4513/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4514/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4515/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4516/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4517///
4518/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
4519#[inline]
4520#[target_feature(enable = "avx512fp16")]
4521#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4522#[rustc_legacy_const_generics(4)]
4523#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4524pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
4525    a: __m128h,
4526    b: __m128h,
4527    c: __m128h,
4528    k: __mmask8,
4529) -> __m128h {
4530    unsafe {
4531        static_assert_rounding!(ROUNDING);
4532        let c = transmute(c);
4533        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
4534        transmute(_mm_move_ss(c, r))
4535    }
4536}
4537
4538/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4539/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4540/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4541/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4542/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4543///
4544/// Rounding is done according to the rounding parameter, which can be one of:
4545///
4546/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4547/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4548/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4549/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4550/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4551///
4552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
4553#[inline]
4554#[target_feature(enable = "avx512fp16")]
4555#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4556#[rustc_legacy_const_generics(4)]
4557#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4558pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
4559    k: __mmask8,
4560    a: __m128h,
4561    b: __m128h,
4562    c: __m128h,
4563) -> __m128h {
4564    unsafe {
4565        static_assert_rounding!(ROUNDING);
4566        transmute(vfmaddcsh_maskz(
4567            transmute(a),
4568            transmute(b),
4569            transmute(c),
4570            k,
4571            ROUNDING,
4572        ))
4573    }
4574}
4575
4576/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4577/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4578/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4579/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4580///
4581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
4582#[inline]
4583#[target_feature(enable = "avx512fp16,avx512vl")]
4584#[cfg_attr(test, assert_instr(vfcmaddcph))]
4585#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4586pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4587    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
4588}
4589
4590/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4591/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4592/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4593/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4594/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4595///
4596/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
4597#[inline]
4598#[target_feature(enable = "avx512fp16,avx512vl")]
4599#[cfg_attr(test, assert_instr(vfcmaddcph))]
4600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4601pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4602    unsafe {
4603        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4604        transmute(simd_select_bitmask(k, r, transmute(a)))
4605    }
4606}
4607
4608/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4609/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4610/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4611/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4612/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4613///
4614/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
4615#[inline]
4616#[target_feature(enable = "avx512fp16,avx512vl")]
4617#[cfg_attr(test, assert_instr(vfcmaddcph))]
4618#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4619pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4620    unsafe {
4621        transmute(vfcmaddcph_mask3_128(
4622            transmute(a),
4623            transmute(b),
4624            transmute(c),
4625            k,
4626        ))
4627    }
4628}
4629
4630/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4631/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4632/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4633/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4634/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4635///
4636/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
4637#[inline]
4638#[target_feature(enable = "avx512fp16,avx512vl")]
4639#[cfg_attr(test, assert_instr(vfcmaddcph))]
4640#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4641pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4642    unsafe {
4643        transmute(vfcmaddcph_maskz_128(
4644            transmute(a),
4645            transmute(b),
4646            transmute(c),
4647            k,
4648        ))
4649    }
4650}
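
// A minimal usage sketch (illustrative only; `example_mask_fcmadd_pch` is a
// hypothetical helper, not part of this module). The conjugate form computes
// a * conj(b) + c per complex number: for `a = x0 + i*x1` and `b = y0 + i*y1`
// the product's real part is `x0*y0 + x1*y1` and its imaginary part is
// `x1*y0 - x0*y1`, to which the corresponding part of c is added.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_mask_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Update only complex numbers 0 and 1 (f16 lanes 0..4); the upper two
    // complex numbers are copied from `a`.
    _mm_mask_fcmadd_pch(a, 0b0011, b, c)
}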
4651
4652/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4653/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4654/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4655/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4656///
4657/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
4658#[inline]
4659#[target_feature(enable = "avx512fp16,avx512vl")]
4660#[cfg_attr(test, assert_instr(vfcmaddcph))]
4661#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4662pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4663    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
4664}
4665
4666/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4667/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4668/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4669/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4670/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4671///
4672/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
4673#[inline]
4674#[target_feature(enable = "avx512fp16,avx512vl")]
4675#[cfg_attr(test, assert_instr(vfcmaddcph))]
4676#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4677pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4678    unsafe {
4679        let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4680        transmute(simd_select_bitmask(k, r, transmute(a)))
4681    }
4682}
4683
4684/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4685/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4686/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4687/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4688/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4689///
4690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
4691#[inline]
4692#[target_feature(enable = "avx512fp16,avx512vl")]
4693#[cfg_attr(test, assert_instr(vfcmaddcph))]
4694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4695pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4696    unsafe {
4697        transmute(vfcmaddcph_mask3_256(
4698            transmute(a),
4699            transmute(b),
4700            transmute(c),
4701            k,
4702        ))
4703    }
4704}
4705
4706/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4707/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4708/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4709/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4710/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4711///
4712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
4713#[inline]
4714#[target_feature(enable = "avx512fp16,avx512vl")]
4715#[cfg_attr(test, assert_instr(vfcmaddcph))]
4716#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4717pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4718    unsafe {
4719        transmute(vfcmaddcph_maskz_256(
4720            transmute(a),
4721            transmute(b),
4722            transmute(c),
4723            k,
4724        ))
4725    }
4726}
4727
4728/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4729/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4730/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4731/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4732///
4733/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
4734#[inline]
4735#[target_feature(enable = "avx512fp16")]
4736#[cfg_attr(test, assert_instr(vfcmaddcph))]
4737#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4738pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4739    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4740}
4741
4742/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4743/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4744/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4745/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4746/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4747///
4748/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
4749#[inline]
4750#[target_feature(enable = "avx512fp16")]
4751#[cfg_attr(test, assert_instr(vfcmaddcph))]
4752#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4753pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4754    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4755}
4756
4757/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4758/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4759/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4760/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4761/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4762///
4763/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
4764#[inline]
4765#[target_feature(enable = "avx512fp16")]
4766#[cfg_attr(test, assert_instr(vfcmaddcph))]
4767#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4768pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4769    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4770}
4771
4772/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4773/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4774/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4775/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4776/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4777///
4778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
4779#[inline]
4780#[target_feature(enable = "avx512fp16")]
4781#[cfg_attr(test, assert_instr(vfcmaddcph))]
4782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4783pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4784    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4785}
4786
4787/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4788/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4789/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4790/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4791///
4792/// Rounding is done according to the rounding parameter, which can be one of:
4793///
4794/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4795/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4796/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4797/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4798/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4799///
4800/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
4801#[inline]
4802#[target_feature(enable = "avx512fp16")]
4803#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4804#[rustc_legacy_const_generics(3)]
4805#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4806pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4807    static_assert_rounding!(ROUNDING);
4808    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4809}
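
// A minimal usage sketch (illustrative only; `example_fcmadd_round_pch` is a
// hypothetical helper, not part of this module). Passing
// `_MM_FROUND_CUR_DIRECTION` defers to the rounding mode currently selected
// in `MXCSR.RC`, which makes the call equivalent to `_mm512_fcmadd_pch`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_fcmadd_round_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}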
4810
4811/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4812/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4813/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4814/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4815/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4816///
4817/// Rounding is done according to the rounding parameter, which can be one of:
4818///
4819/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4820/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4821/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4822/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4823/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4824///
4825/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
4826#[inline]
4827#[target_feature(enable = "avx512fp16")]
4828#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4829#[rustc_legacy_const_generics(4)]
4830#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4831pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
4832    a: __m512h,
4833    k: __mmask16,
4834    b: __m512h,
4835    c: __m512h,
4836) -> __m512h {
4837    unsafe {
4838        static_assert_rounding!(ROUNDING);
4839        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4840        transmute(simd_select_bitmask(k, r, transmute(a)))
4841    }
4842}
4843
4844/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4845/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
4846/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
4847/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
4848/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4849///
4850/// Rounding is done according to the rounding parameter, which can be one of:
4851///
4852/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4853/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4854/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4855/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4856/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4857///
4858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
4859#[inline]
4860#[target_feature(enable = "avx512fp16")]
4861#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4862#[rustc_legacy_const_generics(4)]
4863#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4864pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
4865    a: __m512h,
4866    b: __m512h,
4867    c: __m512h,
4868    k: __mmask16,
4869) -> __m512h {
4870    unsafe {
4871        static_assert_rounding!(ROUNDING);
4872        transmute(vfcmaddcph_mask3_512(
4873            transmute(a),
4874            transmute(b),
4875            transmute(c),
4876            k,
4877            ROUNDING,
4878        ))
4879    }
4880}
4881
4882/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4883/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
4884/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
4885/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
4886/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4887///
4888/// Rounding is done according to the rounding parameter, which can be one of:
4889///
4890/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4891/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4892/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4893/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4894/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4895///
4896/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
4897#[inline]
4898#[target_feature(enable = "avx512fp16")]
4899#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4900#[rustc_legacy_const_generics(4)]
4901#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4902pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
4903    k: __mmask16,
4904    a: __m512h,
4905    b: __m512h,
4906    c: __m512h,
4907) -> __m512h {
4908    unsafe {
4909        static_assert_rounding!(ROUNDING);
4910        transmute(vfcmaddcph_maskz_512(
4911            transmute(a),
4912            transmute(b),
4913            transmute(c),
4914            k,
4915            ROUNDING,
4916        ))
4917    }
4918}
4919
4920/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4921/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4922/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4923/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4924/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4925///
4926/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
4927#[inline]
4928#[target_feature(enable = "avx512fp16")]
4929#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4930#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4931pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4932    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4933}
4934
4935/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4936/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4937/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
4938/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4939/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4940/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4941///
4942/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
4943#[inline]
4944#[target_feature(enable = "avx512fp16")]
4945#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4946#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4947pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4948    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4949}
4950
4951/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4952/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4953/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
4954/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4955/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4956/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4957///
4958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
4959#[inline]
4960#[target_feature(enable = "avx512fp16")]
4961#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4962#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4963pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4964    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4965}
4966
4967/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4968/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4969/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
4970/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4971/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4972/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4973///
4974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
4975#[inline]
4976#[target_feature(enable = "avx512fp16")]
4977#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4978#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4979pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4980    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4981}
4982
4983/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4984/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4985/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4986/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4987/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4988///
4989/// Rounding is done according to the rounding parameter, which can be one of:
4990///
4991/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4992/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4993/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4994/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4995/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4996///
4997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
4998#[inline]
4999#[target_feature(enable = "avx512fp16")]
5000#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5001#[rustc_legacy_const_generics(3)]
5002#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5003pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5004    unsafe {
5005        static_assert_rounding!(ROUNDING);
5006        transmute(vfcmaddcsh_mask(
5007            transmute(a),
5008            transmute(b),
5009            transmute(c),
5010            0xff,
5011            ROUNDING,
5012        ))
5013    }
5014}
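
// A minimal usage sketch (illustrative only; `example_fcmadd_round_sch` is a
// hypothetical helper, not part of this module). The scalar conjugate form
// touches only f16 lanes 0 and 1; the remaining six lanes are copied from `a`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_fcmadd_round_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c)
}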
5015
5016/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5017/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5018/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
5019/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5020/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5021/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5022///
5023/// Rounding is done according to the rounding parameter, which can be one of:
5024///
5025/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5026/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5027/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5028/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5029/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5030///
5031/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
5032#[inline]
5033#[target_feature(enable = "avx512fp16")]
5034#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5035#[rustc_legacy_const_generics(4)]
5036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5037pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
5038    a: __m128h,
5039    k: __mmask8,
5040    b: __m128h,
5041    c: __m128h,
5042) -> __m128h {
5043    unsafe {
5044        static_assert_rounding!(ROUNDING);
5045        let a = transmute(a);
5046        let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
5047        transmute(_mm_mask_move_ss(a, k, a, r))
5048    }
5049}
5050
5051/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5052/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5053/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
5054/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5055/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5056/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5057///
5058/// Rounding is done according to the rounding parameter, which can be one of:
5059///
5060/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5061/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5062/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5063/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5064/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5065///
5066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
5067#[inline]
5068#[target_feature(enable = "avx512fp16")]
5069#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5070#[rustc_legacy_const_generics(4)]
5071#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5072pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
5073    a: __m128h,
5074    b: __m128h,
5075    c: __m128h,
5076    k: __mmask8,
5077) -> __m128h {
5078    unsafe {
5079        static_assert_rounding!(ROUNDING);
5080        let c = transmute(c);
5081        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
5082        transmute(_mm_move_ss(c, r))
5083    }
5084}
5085
5086/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5087/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
5088/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
5089/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
5090/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
5091/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5092///
5093/// Rounding is done according to the rounding parameter, which can be one of:
5094///
5095/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5096/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5097/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5098/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5099/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5100///
5101/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
5102#[inline]
5103#[target_feature(enable = "avx512fp16")]
5104#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5105#[rustc_legacy_const_generics(4)]
5106#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5107pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
5108    k: __mmask8,
5109    a: __m128h,
5110    b: __m128h,
5111    c: __m128h,
5112) -> __m128h {
5113    unsafe {
5114        static_assert_rounding!(ROUNDING);
5115        transmute(vfcmaddcsh_maskz(
5116            transmute(a),
5117            transmute(b),
5118            transmute(c),
5119            k,
5120            ROUNDING,
5121        ))
5122    }
5123}
5124
5125/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5126/// result to packed elements in c, and store the results in dst.
5127///
5128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
5129#[inline]
5130#[target_feature(enable = "avx512fp16,avx512vl")]
5131#[cfg_attr(test, assert_instr(vfmadd))]
5132#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5133pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5134    unsafe { simd_fma(a, b, c) }
5135}
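
// A minimal usage sketch (illustrative only; `example_fmadd_ph` is a
// hypothetical helper, not part of this module). With small integer inputs
// the fused multiply-add is exact in f16, so every lane below holds 7.0.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_fmadd_ph() -> __m128h {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    // Computes 2.0 * 3.0 + 1.0 in each of the eight f16 lanes.
    _mm_fmadd_ph(a, b, c)
}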
5136
5137/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5138/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5139/// from a when the corresponding mask bit is not set).
5140///
5141/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
5142#[inline]
5143#[target_feature(enable = "avx512fp16,avx512vl")]
5144#[cfg_attr(test, assert_instr(vfmadd))]
5145#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5146pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5147    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) }
5148}
5149
5150/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5151/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5152/// from c when the corresponding mask bit is not set).
5153///
5154/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
5155#[inline]
5156#[target_feature(enable = "avx512fp16,avx512vl")]
5157#[cfg_attr(test, assert_instr(vfmadd))]
5158#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5159pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5160    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) }
5161}
5162
5163/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5164/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5165/// out when the corresponding mask bit is not set).
5166///
5167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
5168#[inline]
5169#[target_feature(enable = "avx512fp16,avx512vl")]
5170#[cfg_attr(test, assert_instr(vfmadd))]
5171#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5172pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5173    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) }
5174}
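
// A minimal usage sketch (illustrative only; `example_masked_fmadd_ph` is a
// hypothetical helper, not part of this module) contrasting the writemask and
// zeromask forms. With mask 0b0000_1111 the lower four lanes hold a*b+c in
// both results; the upper four lanes keep `a` in the writemask form and
// become 0.0 in the zeromask form.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_masked_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> (__m128h, __m128h) {
    let merged = _mm_mask_fmadd_ph(a, 0b0000_1111, b, c);
    let zeroed = _mm_maskz_fmadd_ph(0b0000_1111, a, b, c);
    (merged, zeroed)
}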
5175
5176/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5177/// result to packed elements in c, and store the results in dst.
5178///
5179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
5180#[inline]
5181#[target_feature(enable = "avx512fp16,avx512vl")]
5182#[cfg_attr(test, assert_instr(vfmadd))]
5183#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5184pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5185    unsafe { simd_fma(a, b, c) }
5186}
5187
5188/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5189/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5190/// from a when the corresponding mask bit is not set).
5191///
5192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
5193#[inline]
5194#[target_feature(enable = "avx512fp16,avx512vl")]
5195#[cfg_attr(test, assert_instr(vfmadd))]
5196#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5197pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5198    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) }
5199}
5200
5201/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5202/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5203/// from c when the corresponding mask bit is not set).
5204///
5205/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
5206#[inline]
5207#[target_feature(enable = "avx512fp16,avx512vl")]
5208#[cfg_attr(test, assert_instr(vfmadd))]
5209#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5210pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5211    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) }
5212}
5213
5214/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5215/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5216/// out when the corresponding mask bit is not set).
5217///
5218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
5219#[inline]
5220#[target_feature(enable = "avx512fp16,avx512vl")]
5221#[cfg_attr(test, assert_instr(vfmadd))]
5222#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5223pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5224    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) }
5225}
5226
5227/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5228/// result to packed elements in c, and store the results in dst.
5229///
5230/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
5231#[inline]
5232#[target_feature(enable = "avx512fp16")]
5233#[cfg_attr(test, assert_instr(vfmadd))]
5234#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5235pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5236    unsafe { simd_fma(a, b, c) }
5237}
5238
5239/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5240/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5241/// from a when the corresponding mask bit is not set).
5242///
5243/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
5244#[inline]
5245#[target_feature(enable = "avx512fp16")]
5246#[cfg_attr(test, assert_instr(vfmadd))]
5247#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5248pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5249    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) }
5250}
5251
5252/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5253/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5254/// from c when the corresponding mask bit is not set).
5255///
5256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
5257#[inline]
5258#[target_feature(enable = "avx512fp16")]
5259#[cfg_attr(test, assert_instr(vfmadd))]
5260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5261pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5262    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) }
5263}
5264
5265/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5266/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5267/// out when the corresponding mask bit is not set).
5268///
5269/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
5270#[inline]
5271#[target_feature(enable = "avx512fp16")]
5272#[cfg_attr(test, assert_instr(vfmadd))]
5273#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5274pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5275    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) }
5276}
5277
5278/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5279/// result to packed elements in c, and store the results in dst.
5280///
5281/// Rounding is done according to the rounding parameter, which can be one of:
5282///
5283/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5284/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5285/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5286/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5287/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5288///
5289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
5290#[inline]
5291#[target_feature(enable = "avx512fp16")]
5292#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5293#[rustc_legacy_const_generics(3)]
5294#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5295pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5296    unsafe {
5297        static_assert_rounding!(ROUNDING);
5298        vfmaddph_512(a, b, c, ROUNDING)
5299    }
5300}
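
// A minimal usage sketch (illustrative only; `example_fmadd_round_ph` is a
// hypothetical helper, not part of this module). Unlike the complex `_pch`
// forms, this operates lane-wise on all 32 f16 elements.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_fmadd_round_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Round toward negative infinity and suppress floating-point exceptions.
    _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b, c)
}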
5301
5302/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5303/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5304/// from a when the corresponding mask bit is not set).
5305///
5306/// Rounding is done according to the rounding parameter, which can be one of:
5307///
5308/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5309/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5310/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5311/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5312/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5313///
5314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
5315#[inline]
5316#[target_feature(enable = "avx512fp16")]
5317#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5318#[rustc_legacy_const_generics(4)]
5319#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5320pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
5321    a: __m512h,
5322    k: __mmask32,
5323    b: __m512h,
5324    c: __m512h,
5325) -> __m512h {
5326    unsafe {
5327        static_assert_rounding!(ROUNDING);
5328        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
5329    }
5330}
5331
5332/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5333/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5334/// from c when the corresponding mask bit is not set).
5335///
5336/// Rounding is done according to the rounding parameter, which can be one of:
5337///
5338/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5339/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5340/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5341/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5342/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5343///
5344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
5345#[inline]
5346#[target_feature(enable = "avx512fp16")]
5347#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5348#[rustc_legacy_const_generics(4)]
5349#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5350pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
5351    a: __m512h,
5352    b: __m512h,
5353    c: __m512h,
5354    k: __mmask32,
5355) -> __m512h {
5356    unsafe {
5357        static_assert_rounding!(ROUNDING);
5358        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
5359    }
5360}
5361
5362/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5363/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5364/// out when the corresponding mask bit is not set).
5365///
5366/// Rounding is done according to the rounding parameter, which can be one of:
5367///
5368/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5369/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5370/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5371/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5372/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5373///
5374/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
5375#[inline]
5376#[target_feature(enable = "avx512fp16")]
5377#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5378#[rustc_legacy_const_generics(4)]
5379#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5380pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
5381    k: __mmask32,
5382    a: __m512h,
5383    b: __m512h,
5384    c: __m512h,
5385) -> __m512h {
5386    unsafe {
5387        static_assert_rounding!(ROUNDING);
5388        simd_select_bitmask(
5389            k,
5390            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
5391            _mm512_setzero_ph(),
5392        )
5393    }
5394}
5395
5396/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5397/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5398/// 7 packed elements from a to the upper elements of dst.
5399///
5400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
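///
/// # Example
///
/// An illustrative sketch (assumes the unstable `f16` type, the `stdarch_x86_avx512_f16`
/// feature, and AVX512-FP16 enabled for the caller):
///
/// ```ignore
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(1.0);
/// // Element 0 becomes 2.0 * 3.0 + 1.0 = 7.0; elements 1..8 are copied from `a`.
/// let r = _mm_fmadd_sh(a, b, c);
/// ```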
5401#[inline]
5402#[target_feature(enable = "avx512fp16")]
5403#[cfg_attr(test, assert_instr(vfmadd))]
5404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5405pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5406    unsafe {
5407        let extracta: f16 = simd_extract!(a, 0);
5408        let extractb: f16 = simd_extract!(b, 0);
5409        let extractc: f16 = simd_extract!(c, 0);
5410        let r = fmaf16(extracta, extractb, extractc);
5411        simd_insert!(a, 0, r)
5412    }
5413}
5414
5415/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5416/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5417/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5418/// upper elements of dst.
5419///
5420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
5421#[inline]
5422#[target_feature(enable = "avx512fp16")]
5423#[cfg_attr(test, assert_instr(vfmadd))]
5424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5425pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5426    unsafe {
5427        let mut fmadd: f16 = simd_extract!(a, 0);
5428        if k & 1 != 0 {
5429            let extractb: f16 = simd_extract!(b, 0);
5430            let extractc: f16 = simd_extract!(c, 0);
5431            fmadd = fmaf16(fmadd, extractb, extractc);
5432        }
5433        simd_insert!(a, 0, fmadd)
5434    }
5435}
5436
5437/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5438/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5439/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5440/// upper elements of dst.
5441///
5442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
5443#[inline]
5444#[target_feature(enable = "avx512fp16")]
5445#[cfg_attr(test, assert_instr(vfmadd))]
5446#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5447pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5448    unsafe {
5449        let mut fmadd: f16 = simd_extract!(c, 0);
5450        if k & 1 != 0 {
5451            let extracta: f16 = simd_extract!(a, 0);
5452            let extractb: f16 = simd_extract!(b, 0);
5453            fmadd = fmaf16(extracta, extractb, fmadd);
5454        }
5455        simd_insert!(c, 0, fmadd)
5456    }
5457}
5458
5459/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5460/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5461/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5462/// upper elements of dst.
5463///
5464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
5465#[inline]
5466#[target_feature(enable = "avx512fp16")]
5467#[cfg_attr(test, assert_instr(vfmadd))]
5468#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5469pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5470    unsafe {
5471        let mut fmadd: f16 = 0.0;
5472        if k & 1 != 0 {
5473            let extracta: f16 = simd_extract!(a, 0);
5474            let extractb: f16 = simd_extract!(b, 0);
5475            let extractc: f16 = simd_extract!(c, 0);
5476            fmadd = fmaf16(extracta, extractb, extractc);
5477        }
5478        simd_insert!(a, 0, fmadd)
5479    }
5480}
5481
5482/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5483/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5484/// 7 packed elements from a to the upper elements of dst.
5485///
5486/// Rounding is done according to the rounding parameter, which can be one of:
5487///
5488/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5489/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5490/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5491/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5492/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5493///
5494/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
5495#[inline]
5496#[target_feature(enable = "avx512fp16")]
5497#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5498#[rustc_legacy_const_generics(3)]
5499#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5500pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5501    unsafe {
5502        static_assert_rounding!(ROUNDING);
5503        let extracta: f16 = simd_extract!(a, 0);
5504        let extractb: f16 = simd_extract!(b, 0);
5505        let extractc: f16 = simd_extract!(c, 0);
5506        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5507        simd_insert!(a, 0, r)
5508    }
5509}
5510
5511/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5512/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5513/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5514/// upper elements of dst.
5515///
5516/// Rounding is done according to the rounding parameter, which can be one of:
5517///
5518/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5519/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5520/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5521/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5522/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5523///
5524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
5525#[inline]
5526#[target_feature(enable = "avx512fp16")]
5527#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5528#[rustc_legacy_const_generics(4)]
5529#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5530pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
5531    a: __m128h,
5532    k: __mmask8,
5533    b: __m128h,
5534    c: __m128h,
5535) -> __m128h {
5536    unsafe {
5537        static_assert_rounding!(ROUNDING);
5538        let mut fmadd: f16 = simd_extract!(a, 0);
5539        if k & 1 != 0 {
5540            let extractb: f16 = simd_extract!(b, 0);
5541            let extractc: f16 = simd_extract!(c, 0);
5542            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
5543        }
5544        simd_insert!(a, 0, fmadd)
5545    }
5546}
5547
5548/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5549/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5550/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5551/// upper elements of dst.
5552///
5553/// Rounding is done according to the rounding parameter, which can be one of:
5554///
5555/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5556/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5557/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5558/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5559/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5560///
5561/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
5562#[inline]
5563#[target_feature(enable = "avx512fp16")]
5564#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5565#[rustc_legacy_const_generics(4)]
5566#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5567pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
5568    a: __m128h,
5569    b: __m128h,
5570    c: __m128h,
5571    k: __mmask8,
5572) -> __m128h {
5573    unsafe {
5574        static_assert_rounding!(ROUNDING);
5575        let mut fmadd: f16 = simd_extract!(c, 0);
5576        if k & 1 != 0 {
5577            let extracta: f16 = simd_extract!(a, 0);
5578            let extractb: f16 = simd_extract!(b, 0);
5579            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
5580        }
5581        simd_insert!(c, 0, fmadd)
5582    }
5583}
5584
5585/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5586/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5587/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5588/// upper elements of dst.
5589///
5590/// Rounding is done according to the rounding parameter, which can be one of:
5591///
5592/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5593/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5594/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5595/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5596/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5597///
5598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
5599#[inline]
5600#[target_feature(enable = "avx512fp16")]
5601#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5602#[rustc_legacy_const_generics(4)]
5603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5604pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
5605    k: __mmask8,
5606    a: __m128h,
5607    b: __m128h,
5608    c: __m128h,
5609) -> __m128h {
5610    unsafe {
5611        static_assert_rounding!(ROUNDING);
5612        let mut fmadd: f16 = 0.0;
5613        if k & 1 != 0 {
5614            let extracta: f16 = simd_extract!(a, 0);
5615            let extractb: f16 = simd_extract!(b, 0);
5616            let extractc: f16 = simd_extract!(c, 0);
5617            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5618        }
5619        simd_insert!(a, 0, fmadd)
5620    }
5621}
5622
5623/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5624/// in c from the intermediate result, and store the results in dst.
5626///
5627/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
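///
/// # Example
///
/// An illustrative sketch (same feature assumptions as the fmadd examples above):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Every lane holds 2.0 * 3.0 - 1.0 = 5.0.
/// let r = _mm_fmsub_ph(a, b, c);
/// ```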
5628#[inline]
5629#[target_feature(enable = "avx512fp16,avx512vl")]
5630#[cfg_attr(test, assert_instr(vfmsub))]
5631#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5632pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5633    unsafe { simd_fma(a, b, simd_neg(c)) }
5634}
5635
5636/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5637/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5638/// from a when the corresponding mask bit is not set).
5639///
5640/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
5641#[inline]
5642#[target_feature(enable = "avx512fp16,avx512vl")]
5643#[cfg_attr(test, assert_instr(vfmsub))]
5644#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5645pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5646    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
5647}
5648
5649/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5650/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5651/// from c when the corresponding mask bit is not set).
5652///
5653/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
5654#[inline]
5655#[target_feature(enable = "avx512fp16,avx512vl")]
5656#[cfg_attr(test, assert_instr(vfmsub))]
5657#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5658pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5659    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) }
5660}
5661
5662/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5663/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5664/// out when the corresponding mask bit is not set).
5665///
5666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
5667#[inline]
5668#[target_feature(enable = "avx512fp16,avx512vl")]
5669#[cfg_attr(test, assert_instr(vfmsub))]
5670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5671pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5672    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) }
5673}
5674
5675/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5676/// in c from the intermediate result, and store the results in dst.
5677///
5678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
5679#[inline]
5680#[target_feature(enable = "avx512fp16,avx512vl")]
5681#[cfg_attr(test, assert_instr(vfmsub))]
5682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5683pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5684    unsafe { simd_fma(a, b, simd_neg(c)) }
5685}
5686
5687/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5688/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5689/// from a when the corresponding mask bit is not set).
5690///
5691/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
5692#[inline]
5693#[target_feature(enable = "avx512fp16,avx512vl")]
5694#[cfg_attr(test, assert_instr(vfmsub))]
5695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5696pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5697    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) }
5698}
5699
5700/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5701/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5702/// from c when the corresponding mask bit is not set).
5703///
5704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
5705#[inline]
5706#[target_feature(enable = "avx512fp16,avx512vl")]
5707#[cfg_attr(test, assert_instr(vfmsub))]
5708#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5709pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5710    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) }
5711}
5712
5713/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5714/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5715/// out when the corresponding mask bit is not set).
5716///
5717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
5718#[inline]
5719#[target_feature(enable = "avx512fp16,avx512vl")]
5720#[cfg_attr(test, assert_instr(vfmsub))]
5721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5722pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5723    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) }
5724}
5725
5726/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5727/// in c from the intermediate result, and store the results in dst.
5728///
5729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
5730#[inline]
5731#[target_feature(enable = "avx512fp16")]
5732#[cfg_attr(test, assert_instr(vfmsub))]
5733#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5734pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5735    unsafe { simd_fma(a, b, simd_neg(c)) }
5736}
5737
5738/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5739/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5740/// from a when the corresponding mask bit is not set).
5741///
5742/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
5743#[inline]
5744#[target_feature(enable = "avx512fp16")]
5745#[cfg_attr(test, assert_instr(vfmsub))]
5746#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5747pub fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5748    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) }
5749}
5750
5751/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5752/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5753/// from c when the corresponding mask bit is not set).
5754///
5755/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
5756#[inline]
5757#[target_feature(enable = "avx512fp16")]
5758#[cfg_attr(test, assert_instr(vfmsub))]
5759#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5760pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5761    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) }
5762}
5763
5764/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5765/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5766/// out when the corresponding mask bit is not set).
5767///
5768/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
5769#[inline]
5770#[target_feature(enable = "avx512fp16")]
5771#[cfg_attr(test, assert_instr(vfmsub))]
5772#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5773pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5774    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) }
5775}
5776
5777/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5778/// in c from the intermediate result, and store the results in dst.
5779///
5780/// Rounding is done according to the rounding parameter, which can be one of:
5781///
5782/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5783/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5784/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5785/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5786/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5787///
5788/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
5789#[inline]
5790#[target_feature(enable = "avx512fp16")]
5791#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5792#[rustc_legacy_const_generics(3)]
5793#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5794pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5795    unsafe {
5796        static_assert_rounding!(ROUNDING);
5797        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
5798    }
5799}
5800
5801/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5802/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5803/// from a when the corresponding mask bit is not set).
5804///
5805/// Rounding is done according to the rounding parameter, which can be one of:
5806///
5807/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5808/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5809/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5810/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5811/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5812///
5813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
5814#[inline]
5815#[target_feature(enable = "avx512fp16")]
5816#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5817#[rustc_legacy_const_generics(4)]
5818#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5819pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
5820    a: __m512h,
5821    k: __mmask32,
5822    b: __m512h,
5823    c: __m512h,
5824) -> __m512h {
5825    unsafe {
5826        static_assert_rounding!(ROUNDING);
5827        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
5828    }
5829}
5830
5831/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5832/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5833/// from c when the corresponding mask bit is not set).
5834///
5835/// Rounding is done according to the rounding parameter, which can be one of:
5836///
5837/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5838/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5839/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5840/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5841/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5842///
5843/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
5844#[inline]
5845#[target_feature(enable = "avx512fp16")]
5846#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5847#[rustc_legacy_const_generics(4)]
5848#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5849pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
5850    a: __m512h,
5851    b: __m512h,
5852    c: __m512h,
5853    k: __mmask32,
5854) -> __m512h {
5855    unsafe {
5856        static_assert_rounding!(ROUNDING);
5857        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
5858    }
5859}
5860
5861/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5862/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5863/// out when the corresponding mask bit is not set).
5864///
5865/// Rounding is done according to the rounding parameter, which can be one of:
5866///
5867/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5868/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5869/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5870/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5871/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5872///
5873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
5874#[inline]
5875#[target_feature(enable = "avx512fp16")]
5876#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5877#[rustc_legacy_const_generics(4)]
5878#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5879pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
5880    k: __mmask32,
5881    a: __m512h,
5882    b: __m512h,
5883    c: __m512h,
5884) -> __m512h {
5885    unsafe {
5886        static_assert_rounding!(ROUNDING);
5887        simd_select_bitmask(
5888            k,
5889            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
5890            _mm512_setzero_ph(),
5891        )
5892    }
5893}
5894
5895/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5896/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5897/// 7 packed elements from a to the upper elements of dst.
5898///
5899/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
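///
/// # Example
///
/// An illustrative sketch (same feature assumptions as the other scalar examples):
///
/// ```ignore
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(1.0);
/// // Element 0 becomes 2.0 * 3.0 - 1.0 = 5.0; elements 1..8 are copied from `a`.
/// let r = _mm_fmsub_sh(a, b, c);
/// ```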
5900#[inline]
5901#[target_feature(enable = "avx512fp16")]
5902#[cfg_attr(test, assert_instr(vfmsub))]
5903#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5904pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5905    unsafe {
5906        let extracta: f16 = simd_extract!(a, 0);
5907        let extractb: f16 = simd_extract!(b, 0);
5908        let extractc: f16 = simd_extract!(c, 0);
5909        let r = fmaf16(extracta, extractb, -extractc);
5910        simd_insert!(a, 0, r)
5911    }
5912}
5913
5914/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5915/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5916/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5917/// upper elements of dst.
5918///
5919/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
5920#[inline]
5921#[target_feature(enable = "avx512fp16")]
5922#[cfg_attr(test, assert_instr(vfmsub))]
5923#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5924pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5925    unsafe {
5926        let mut fmsub: f16 = simd_extract!(a, 0);
5927        if k & 1 != 0 {
5928            let extractb: f16 = simd_extract!(b, 0);
5929            let extractc: f16 = simd_extract!(c, 0);
5930            fmsub = fmaf16(fmsub, extractb, -extractc);
5931        }
5932        simd_insert!(a, 0, fmsub)
5933    }
5934}
5935
5936/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5937/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5938/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5939/// upper elements of dst.
5940///
5941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
5942#[inline]
5943#[target_feature(enable = "avx512fp16")]
5944#[cfg_attr(test, assert_instr(vfmsub))]
5945#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5946pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5947    unsafe {
5948        let mut fmsub: f16 = simd_extract!(c, 0);
5949        if k & 1 != 0 {
5950            let extracta: f16 = simd_extract!(a, 0);
5951            let extractb: f16 = simd_extract!(b, 0);
5952            fmsub = fmaf16(extracta, extractb, -fmsub);
5953        }
5954        simd_insert!(c, 0, fmsub)
5955    }
5956}
5957
5958/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5959/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
5960/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5961/// upper elements of dst.
5962///
5963/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
5964#[inline]
5965#[target_feature(enable = "avx512fp16")]
5966#[cfg_attr(test, assert_instr(vfmsub))]
5967#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5968pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5969    unsafe {
5970        let mut fmsub: f16 = 0.0;
5971        if k & 1 != 0 {
5972            let extracta: f16 = simd_extract!(a, 0);
5973            let extractb: f16 = simd_extract!(b, 0);
5974            let extractc: f16 = simd_extract!(c, 0);
5975            fmsub = fmaf16(extracta, extractb, -extractc);
5976        }
5977        simd_insert!(a, 0, fmsub)
5978    }
5979}
5980
5981/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5982/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5983/// 7 packed elements from a to the upper elements of dst.
5984///
5985/// Rounding is done according to the rounding parameter, which can be one of:
5986///
5987/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5988/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5989/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5990/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5991/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5992///
5993/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
5994#[inline]
5995#[target_feature(enable = "avx512fp16")]
5996#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5997#[rustc_legacy_const_generics(3)]
5998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5999pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6000    unsafe {
6001        static_assert_rounding!(ROUNDING);
6002        let extracta: f16 = simd_extract!(a, 0);
6003        let extractb: f16 = simd_extract!(b, 0);
6004        let extractc: f16 = simd_extract!(c, 0);
6005        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6006        simd_insert!(a, 0, r)
6007    }
6008}
6009
6010/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
6011/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6012/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6013/// upper elements of dst.
6014///
6015/// Rounding is done according to the rounding parameter, which can be one of:
6016///
6017/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6018/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6019/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6020/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6021/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6022///
6023/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
6024#[inline]
6025#[target_feature(enable = "avx512fp16")]
6026#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6027#[rustc_legacy_const_generics(4)]
6028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6029pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
6030    a: __m128h,
6031    k: __mmask8,
6032    b: __m128h,
6033    c: __m128h,
6034) -> __m128h {
6035    unsafe {
6036        static_assert_rounding!(ROUNDING);
6037        let mut fmsub: f16 = simd_extract!(a, 0);
6038        if k & 1 != 0 {
6039            let extractb: f16 = simd_extract!(b, 0);
6040            let extractc: f16 = simd_extract!(c, 0);
6041            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
6042        }
6043        simd_insert!(a, 0, fmsub)
6044    }
6045}
6046
6047/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
6048/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6049/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6050/// upper elements of dst.
6051///
6052/// Rounding is done according to the rounding parameter, which can be one of:
6053///
6054/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6055/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6056/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6057/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6058/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6059///
6060/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
6061#[inline]
6062#[target_feature(enable = "avx512fp16")]
6063#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6064#[rustc_legacy_const_generics(4)]
6065#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6066pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
6067    a: __m128h,
6068    b: __m128h,
6069    c: __m128h,
6070    k: __mmask8,
6071) -> __m128h {
6072    unsafe {
6073        static_assert_rounding!(ROUNDING);
6074        let mut fmsub: f16 = simd_extract!(c, 0);
6075        if k & 1 != 0 {
6076            let extracta: f16 = simd_extract!(a, 0);
6077            let extractb: f16 = simd_extract!(b, 0);
6078            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
6079        }
6080        simd_insert!(c, 0, fmsub)
6081    }
6082}
6083
6084/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
6085/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
6086/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6087/// upper elements of dst.
6088///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
6089/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
6090#[inline]
6091#[target_feature(enable = "avx512fp16")]
6092#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6093#[rustc_legacy_const_generics(4)]
6094#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6095pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
6096    k: __mmask8,
6097    a: __m128h,
6098    b: __m128h,
6099    c: __m128h,
6100) -> __m128h {
6101    unsafe {
6102        static_assert_rounding!(ROUNDING);
6103        let mut fmsub: f16 = 0.0;
6104        if k & 1 != 0 {
6105            let extracta: f16 = simd_extract!(a, 0);
6106            let extractb: f16 = simd_extract!(b, 0);
6107            let extractc: f16 = simd_extract!(c, 0);
6108            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6109        }
6110        simd_insert!(a, 0, fmsub)
6111    }
6112}
6113
6114/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6115/// result from packed elements in c, and store the results in dst.
6116///
6117/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
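///
/// # Example
///
/// An illustrative sketch (same feature assumptions as the examples above); the product
/// is negated before the add:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Every lane holds -(2.0 * 3.0) + 1.0 = -5.0.
/// let r = _mm_fnmadd_ph(a, b, c);
/// ```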
6118#[inline]
6119#[target_feature(enable = "avx512fp16,avx512vl")]
6120#[cfg_attr(test, assert_instr(vfnmadd))]
6121#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6122pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6123    unsafe { simd_fma(simd_neg(a), b, c) }
6124}
6125
6126/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6127/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6128/// from a when the corresponding mask bit is not set).
6129///
6130/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
6131#[inline]
6132#[target_feature(enable = "avx512fp16,avx512vl")]
6133#[cfg_attr(test, assert_instr(vfnmadd))]
6134#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6135pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6136    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) }
6137}
6138
6139/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6140/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6141/// from c when the corresponding mask bit is not set).
6142///
6143/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
6144#[inline]
6145#[target_feature(enable = "avx512fp16,avx512vl")]
6146#[cfg_attr(test, assert_instr(vfnmadd))]
6147#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6148pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6149    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) }
6150}
6151
6152/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6153/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6154/// out when the corresponding mask bit is not set).
6155///
6156/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
6157#[inline]
6158#[target_feature(enable = "avx512fp16,avx512vl")]
6159#[cfg_attr(test, assert_instr(vfnmadd))]
6160#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6161pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6162    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) }
6163}
6164
6165/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6166/// result from packed elements in c, and store the results in dst.
6167///
6168/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
6169#[inline]
6170#[target_feature(enable = "avx512fp16,avx512vl")]
6171#[cfg_attr(test, assert_instr(vfnmadd))]
6172#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6173pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6174    unsafe { simd_fma(simd_neg(a), b, c) }
6175}
6176
6177/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6178/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6179/// from a when the corresponding mask bit is not set).
6180///
6181/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
6182#[inline]
6183#[target_feature(enable = "avx512fp16,avx512vl")]
6184#[cfg_attr(test, assert_instr(vfnmadd))]
6185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6186pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6187    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) }
6188}
6189
6190/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6191/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6192/// from c when the corresponding mask bit is not set).
6193///
6194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
6195#[inline]
6196#[target_feature(enable = "avx512fp16,avx512vl")]
6197#[cfg_attr(test, assert_instr(vfnmadd))]
6198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6199pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6200    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) }
6201}
6202
6203/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6204/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6205/// out when the corresponding mask bit is not set).
6206///
6207/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
6208#[inline]
6209#[target_feature(enable = "avx512fp16,avx512vl")]
6210#[cfg_attr(test, assert_instr(vfnmadd))]
6211#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6212pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6213    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) }
6214}
6215
6216/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6217/// result from packed elements in c, and store the results in dst.
6218///
6219/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
6220#[inline]
6221#[target_feature(enable = "avx512fp16")]
6222#[cfg_attr(test, assert_instr(vfnmadd))]
6223#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6224pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6225    unsafe { simd_fma(simd_neg(a), b, c) }
6226}
6227
6228/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6229/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6230/// from a when the corresponding mask bit is not set).
6231///
6232/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
6233#[inline]
6234#[target_feature(enable = "avx512fp16")]
6235#[cfg_attr(test, assert_instr(vfnmadd))]
6236#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6237pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6238    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) }
6239}
6240
6241/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6242/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6243/// from c when the corresponding mask bit is not set).
6244///
6245/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
6246#[inline]
6247#[target_feature(enable = "avx512fp16")]
6248#[cfg_attr(test, assert_instr(vfnmadd))]
6249#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6250pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6251    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) }
6252}
6253
6254/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6255/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6256/// out when the corresponding mask bit is not set).
6257///
6258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
6259#[inline]
6260#[target_feature(enable = "avx512fp16")]
6261#[cfg_attr(test, assert_instr(vfnmadd))]
6262#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6263pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6264    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
6265}
6266
6267/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6268/// result from packed elements in c, and store the results in dst.
6269///
6270/// Rounding is done according to the rounding parameter, which can be one of:
6271///
6272/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6273/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6274/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6275/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6276/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6277///
6278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
6279#[inline]
6280#[target_feature(enable = "avx512fp16")]
6281#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6282#[rustc_legacy_const_generics(3)]
6283#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6284pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6285    unsafe {
6286        static_assert_rounding!(ROUNDING);
6287        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
6288    }
6289}
6290
6291/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6292/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6293/// from a when the corresponding mask bit is not set).
6294///
6295/// Rounding is done according to the rounding parameter, which can be one of:
6296///
6297/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6298/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6299/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6300/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6301/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6302///
6303/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
6304#[inline]
6305#[target_feature(enable = "avx512fp16")]
6306#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6307#[rustc_legacy_const_generics(4)]
6308#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6309pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
6310    a: __m512h,
6311    k: __mmask32,
6312    b: __m512h,
6313    c: __m512h,
6314) -> __m512h {
6315    unsafe {
6316        static_assert_rounding!(ROUNDING);
6317        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
6318    }
6319}
6320
6321/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6322/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6323/// from c when the corresponding mask bit is not set).
6324///
6325/// Rounding is done according to the rounding parameter, which can be one of:
6326///
6327/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6328/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6329/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6330/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6331/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6332///
6333/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
6334#[inline]
6335#[target_feature(enable = "avx512fp16")]
6336#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6337#[rustc_legacy_const_generics(4)]
6338#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6339pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
6340    a: __m512h,
6341    b: __m512h,
6342    c: __m512h,
6343    k: __mmask32,
6344) -> __m512h {
6345    unsafe {
6346        static_assert_rounding!(ROUNDING);
6347        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
6348    }
6349}
6350
6351/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6352/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6353/// out when the corresponding mask bit is not set).
6354///
6355/// Rounding is done according to the rounding parameter, which can be one of:
6356///
6357/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6358/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6359/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6360/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6361/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6362///
6363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
6364#[inline]
6365#[target_feature(enable = "avx512fp16")]
6366#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6367#[rustc_legacy_const_generics(4)]
6368#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6369pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
6370    k: __mmask32,
6371    a: __m512h,
6372    b: __m512h,
6373    c: __m512h,
6374) -> __m512h {
6375    unsafe {
6376        static_assert_rounding!(ROUNDING);
6377        simd_select_bitmask(
6378            k,
6379            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
6380            _mm512_setzero_ph(),
6381        )
6382    }
6383}
6384
6385/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6386/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6387/// elements from a to the upper elements of dst.
6388///
6389/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
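///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(10.0);
/// // Lane 0 becomes -(2.0 * 3.0) + 10.0 == 4.0; lanes 1..8 are copied from `a`.
/// let r = _mm_fnmadd_sh(a, b, c);
/// ```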
6390#[inline]
6391#[target_feature(enable = "avx512fp16")]
6392#[cfg_attr(test, assert_instr(vfnmadd))]
6393#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6394pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6395    unsafe {
6396        let extracta: f16 = simd_extract!(a, 0);
6397        let extractb: f16 = simd_extract!(b, 0);
6398        let extractc: f16 = simd_extract!(c, 0);
6399        let r = fmaf16(-extracta, extractb, extractc);
6400        simd_insert!(a, 0, r)
6401    }
6402}
6403
6404/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6405/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6406/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6407/// elements of dst.
6408///
6409/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
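///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(10.0);
/// // Mask bit 0 is clear, so lane 0 keeps 2.0 from `a` instead of -(2.0 * 3.0) + 10.0.
/// let r = _mm_mask_fnmadd_sh(a, 0, b, c);
/// ```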
6410#[inline]
6411#[target_feature(enable = "avx512fp16")]
6412#[cfg_attr(test, assert_instr(vfnmadd))]
6413#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6414pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6415    unsafe {
6416        let mut fnmadd: f16 = simd_extract!(a, 0);
6417        if k & 1 != 0 {
6418            let extractb: f16 = simd_extract!(b, 0);
6419            let extractc: f16 = simd_extract!(c, 0);
6420            fnmadd = fmaf16(-fnmadd, extractb, extractc);
6421        }
6422        simd_insert!(a, 0, fnmadd)
6423    }
6424}
6425
6426/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6427/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6428/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6429/// elements of dst.
6430///
6431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
6432#[inline]
6433#[target_feature(enable = "avx512fp16")]
6434#[cfg_attr(test, assert_instr(vfnmadd))]
6435#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6436pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6437    unsafe {
6438        let mut fnmadd: f16 = simd_extract!(c, 0);
6439        if k & 1 != 0 {
6440            let extracta: f16 = simd_extract!(a, 0);
6441            let extractb: f16 = simd_extract!(b, 0);
6442            fnmadd = fmaf16(-extracta, extractb, fnmadd);
6443        }
6444        simd_insert!(c, 0, fnmadd)
6445    }
6446}
6447
6448/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6449/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6450/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6451/// elements of dst.
6452///
6453/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
6454#[inline]
6455#[target_feature(enable = "avx512fp16")]
6456#[cfg_attr(test, assert_instr(vfnmadd))]
6457#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6458pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6459    unsafe {
6460        let mut fnmadd: f16 = 0.0;
6461        if k & 1 != 0 {
6462            let extracta: f16 = simd_extract!(a, 0);
6463            let extractb: f16 = simd_extract!(b, 0);
6464            let extractc: f16 = simd_extract!(c, 0);
6465            fnmadd = fmaf16(-extracta, extractb, extractc);
6466        }
6467        simd_insert!(a, 0, fnmadd)
6468    }
6469}
6470
6471/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6472/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6473/// elements from a to the upper elements of dst.
6474///
6475/// Rounding is done according to the rounding parameter, which can be one of:
6476///
6477/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6478/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6479/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6480/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6481/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6482///
6483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
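///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(10.0);
/// // Lane 0 becomes -(2.0 * 3.0) + 10.0 == 4.0 with round-to-nearest and exceptions suppressed.
/// let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```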
6484#[inline]
6485#[target_feature(enable = "avx512fp16")]
6486#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6487#[rustc_legacy_const_generics(3)]
6488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6489pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6490    unsafe {
6491        static_assert_rounding!(ROUNDING);
6492        let extracta: f16 = simd_extract!(a, 0);
6493        let extractb: f16 = simd_extract!(b, 0);
6494        let extractc: f16 = simd_extract!(c, 0);
6495        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6496        simd_insert!(a, 0, r)
6497    }
6498}
6499
6500/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6501/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6502/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6503/// elements of dst.
6504///
6505/// Rounding is done according to the rounding parameter, which can be one of:
6506///
6507/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6508/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6509/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6510/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6511/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6512///
6513/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
6514#[inline]
6515#[target_feature(enable = "avx512fp16")]
6516#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6517#[rustc_legacy_const_generics(4)]
6518#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6519pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
6520    a: __m128h,
6521    k: __mmask8,
6522    b: __m128h,
6523    c: __m128h,
6524) -> __m128h {
6525    unsafe {
6526        static_assert_rounding!(ROUNDING);
6527        let mut fnmadd: f16 = simd_extract!(a, 0);
6528        if k & 1 != 0 {
6529            let extractb: f16 = simd_extract!(b, 0);
6530            let extractc: f16 = simd_extract!(c, 0);
6531            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
6532        }
6533        simd_insert!(a, 0, fnmadd)
6534    }
6535}
6536
6537/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6538/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6539/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6540/// elements of dst.
6541///
6542/// Rounding is done according to the rounding parameter, which can be one of:
6543///
6544/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6545/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6546/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6547/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6548/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6549///
6550/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
6551#[inline]
6552#[target_feature(enable = "avx512fp16")]
6553#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6554#[rustc_legacy_const_generics(4)]
6555#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6556pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
6557    a: __m128h,
6558    b: __m128h,
6559    c: __m128h,
6560    k: __mmask8,
6561) -> __m128h {
6562    unsafe {
6563        static_assert_rounding!(ROUNDING);
6564        let mut fnmadd: f16 = simd_extract!(c, 0);
6565        if k & 1 != 0 {
6566            let extracta: f16 = simd_extract!(a, 0);
6567            let extractb: f16 = simd_extract!(b, 0);
6568            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
6569        }
6570        simd_insert!(c, 0, fnmadd)
6571    }
6572}
6573
6574/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6575/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6576/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6577/// elements of dst.
6578///
6579/// Rounding is done according to the rounding parameter, which can be one of:
6580///
6581/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6582/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6583/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6584/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6585/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6586///
6587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
6588#[inline]
6589#[target_feature(enable = "avx512fp16")]
6590#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6591#[rustc_legacy_const_generics(4)]
6592#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6593pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
6594    k: __mmask8,
6595    a: __m128h,
6596    b: __m128h,
6597    c: __m128h,
6598) -> __m128h {
6599    unsafe {
6600        static_assert_rounding!(ROUNDING);
6601        let mut fnmadd: f16 = 0.0;
6602        if k & 1 != 0 {
6603            let extracta: f16 = simd_extract!(a, 0);
6604            let extractb: f16 = simd_extract!(b, 0);
6605            let extractc: f16 = simd_extract!(c, 0);
6606            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6607        }
6608        simd_insert!(a, 0, fnmadd)
6609    }
6610}
6611
6612/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6613/// in c from the negated intermediate result, and store the results in dst.
6614///
6615/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
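///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` and `avx512vl`
/// support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Every lane becomes -(2.0 * 3.0) - 1.0 == -7.0.
/// let r = _mm_fnmsub_ph(a, b, c);
/// ```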
6616#[inline]
6617#[target_feature(enable = "avx512fp16,avx512vl")]
6618#[cfg_attr(test, assert_instr(vfnmsub))]
6619#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6620pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6621    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6622}
6623
6624/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6625/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6626/// copied from a when the corresponding mask bit is not set).
6627///
6628/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
6629#[inline]
6630#[target_feature(enable = "avx512fp16,avx512vl")]
6631#[cfg_attr(test, assert_instr(vfnmsub))]
6632#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6633pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6634    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
6635}
6636
6637/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6638/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6639/// copied from c when the corresponding mask bit is not set).
6640///
6641/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
6642#[inline]
6643#[target_feature(enable = "avx512fp16,avx512vl")]
6644#[cfg_attr(test, assert_instr(vfnmsub))]
6645#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6646pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6647    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) }
6648}
6649
6650/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6651/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6652/// zeroed out when the corresponding mask bit is not set).
6653///
6654/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
6655#[inline]
6656#[target_feature(enable = "avx512fp16,avx512vl")]
6657#[cfg_attr(test, assert_instr(vfnmsub))]
6658#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6659pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6660    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) }
6661}
6662
6663/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6664/// in c from the negated intermediate result, and store the results in dst.
6665///
6666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
6667#[inline]
6668#[target_feature(enable = "avx512fp16,avx512vl")]
6669#[cfg_attr(test, assert_instr(vfnmsub))]
6670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6671pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6672    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6673}
6674
6675/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6676/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6677/// copied from a when the corresponding mask bit is not set).
6678///
6679/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
6680#[inline]
6681#[target_feature(enable = "avx512fp16,avx512vl")]
6682#[cfg_attr(test, assert_instr(vfnmsub))]
6683#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6684pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6685    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) }
6686}
6687
6688/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6689/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6690/// copied from c when the corresponding mask bit is not set).
6691///
6692/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
6693#[inline]
6694#[target_feature(enable = "avx512fp16,avx512vl")]
6695#[cfg_attr(test, assert_instr(vfnmsub))]
6696#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6697pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6698    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) }
6699}
6700
6701/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6702/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6703/// zeroed out when the corresponding mask bit is not set).
6704///
6705/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
6706#[inline]
6707#[target_feature(enable = "avx512fp16,avx512vl")]
6708#[cfg_attr(test, assert_instr(vfnmsub))]
6709#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6710pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6711    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) }
6712}
6713
6714/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6715/// in c from the negated intermediate result, and store the results in dst.
6716///
6717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
6718#[inline]
6719#[target_feature(enable = "avx512fp16")]
6720#[cfg_attr(test, assert_instr(vfnmsub))]
6721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6722pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6723    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6724}
6725
6726/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6727/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6728/// copied from a when the corresponding mask bit is not set).
6729///
6730/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
6731#[inline]
6732#[target_feature(enable = "avx512fp16")]
6733#[cfg_attr(test, assert_instr(vfnmsub))]
6734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6735pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6736    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) }
6737}
6738
6739/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6740/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6741/// copied from c when the corresponding mask bit is not set).
6742///
6743/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
6744#[inline]
6745#[target_feature(enable = "avx512fp16")]
6746#[cfg_attr(test, assert_instr(vfnmsub))]
6747#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6748pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6749    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) }
6750}
6751
6752/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6753/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6754/// zeroed out when the corresponding mask bit is not set).
6755///
6756/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
6757#[inline]
6758#[target_feature(enable = "avx512fp16")]
6759#[cfg_attr(test, assert_instr(vfnmsub))]
6760#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6761pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6762    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
6763}
6764
6765/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6766/// in c from the negated intermediate result, and store the results in dst.
6767///
6768/// Rounding is done according to the rounding parameter, which can be one of:
6769///
6770/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6771/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6772/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6773/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6774/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6775///
6776/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
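///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm512_set1_ph(2.0);
/// let b = _mm512_set1_ph(3.0);
/// let c = _mm512_set1_ph(1.0);
/// // Every lane becomes -(2.0 * 3.0) - 1.0 == -7.0, rounded toward zero with exceptions suppressed.
/// let r = _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```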
6777#[inline]
6778#[target_feature(enable = "avx512fp16")]
6779#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6780#[rustc_legacy_const_generics(3)]
6781#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6782pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6783    unsafe {
6784        static_assert_rounding!(ROUNDING);
6785        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
6786    }
6787}
6788
6789/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6790/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6791/// copied from a when the corresponding mask bit is not set).
6792///
6793/// Rounding is done according to the rounding parameter, which can be one of:
6794///
6795/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6796/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6797/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6798/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6799/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6800///
6801/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
6802#[inline]
6803#[target_feature(enable = "avx512fp16")]
6804#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6805#[rustc_legacy_const_generics(4)]
6806#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6807pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
6808    a: __m512h,
6809    k: __mmask32,
6810    b: __m512h,
6811    c: __m512h,
6812) -> __m512h {
6813    unsafe {
6814        static_assert_rounding!(ROUNDING);
6815        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
6816    }
6817}
6818
6819/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6820/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6821/// copied from c when the corresponding mask bit is not set).
6822///
6823/// Rounding is done according to the rounding parameter, which can be one of:
6824///
6825/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6826/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6827/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6828/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6829/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6830///
6831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
6832#[inline]
6833#[target_feature(enable = "avx512fp16")]
6834#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6835#[rustc_legacy_const_generics(4)]
6836#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6837pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
6838    a: __m512h,
6839    b: __m512h,
6840    c: __m512h,
6841    k: __mmask32,
6842) -> __m512h {
6843    unsafe {
6844        static_assert_rounding!(ROUNDING);
6845        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
6846    }
6847}
6848
6849/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6850/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6851/// zeroed out when the corresponding mask bit is not set).
6852///
6853/// Rounding is done according to the rounding parameter, which can be one of:
6854///
6855/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6856/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6857/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6858/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6859/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6860///
6861/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
6862#[inline]
6863#[target_feature(enable = "avx512fp16")]
6864#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6865#[rustc_legacy_const_generics(4)]
6866#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6867pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
6868    k: __mmask32,
6869    a: __m512h,
6870    b: __m512h,
6871    c: __m512h,
6872) -> __m512h {
6873    unsafe {
6874        static_assert_rounding!(ROUNDING);
6875        simd_select_bitmask(
6876            k,
6877            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
6878            _mm512_setzero_ph(),
6879        )
6880    }
6881}
6882
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy
/// the upper 7 packed elements from a to the upper elements of dst.
6886///
6887/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
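///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(1.0);
/// // Lane 0 becomes -(2.0 * 3.0) - 1.0 == -7.0; lanes 1..8 are copied from `a` (all zero here).
/// let r = _mm_fnmsub_sh(a, b, c);
/// ```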
6888#[inline]
6889#[target_feature(enable = "avx512fp16")]
6890#[cfg_attr(test, assert_instr(vfnmsub))]
6891#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6892pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6893    unsafe {
6894        let extracta: f16 = simd_extract!(a, 0);
6895        let extractb: f16 = simd_extract!(b, 0);
6896        let extractc: f16 = simd_extract!(c, 0);
6897        let r = fmaf16(-extracta, extractb, -extractc);
6898        simd_insert!(a, 0, r)
6899    }
6900}
6901
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
6906///
6907/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
6908#[inline]
6909#[target_feature(enable = "avx512fp16")]
6910#[cfg_attr(test, assert_instr(vfnmsub))]
6911#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6912pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6913    unsafe {
6914        let mut fnmsub: f16 = simd_extract!(a, 0);
6915        if k & 1 != 0 {
6916            let extractb: f16 = simd_extract!(b, 0);
6917            let extractc: f16 = simd_extract!(c, 0);
6918            fnmsub = fmaf16(-fnmsub, extractb, -extractc);
6919        }
6920        simd_insert!(a, 0, fnmsub)
6921    }
6922}
6923
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from c to the upper elements of dst.
6928///
6929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
6930#[inline]
6931#[target_feature(enable = "avx512fp16")]
6932#[cfg_attr(test, assert_instr(vfnmsub))]
6933#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6934pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6935    unsafe {
6936        let mut fnmsub: f16 = simd_extract!(c, 0);
6937        if k & 1 != 0 {
6938            let extracta: f16 = simd_extract!(a, 0);
6939            let extractb: f16 = simd_extract!(b, 0);
6940            fnmsub = fmaf16(-extracta, extractb, -fnmsub);
6941        }
6942        simd_insert!(c, 0, fnmsub)
6943    }
6944}
6945
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// zeromask k (the element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
6950///
6951/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
6952#[inline]
6953#[target_feature(enable = "avx512fp16")]
6954#[cfg_attr(test, assert_instr(vfnmsub))]
6955#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6956pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6957    unsafe {
6958        let mut fnmsub: f16 = 0.0;
6959        if k & 1 != 0 {
6960            let extracta: f16 = simd_extract!(a, 0);
6961            let extractb: f16 = simd_extract!(b, 0);
6962            let extractc: f16 = simd_extract!(c, 0);
6963            fnmsub = fmaf16(-extracta, extractb, -extractc);
6964        }
6965        simd_insert!(a, 0, fnmsub)
6966    }
6967}
6968
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy
/// the upper 7 packed elements from a to the upper elements of dst.
6972///
6973/// Rounding is done according to the rounding parameter, which can be one of:
6974///
6975/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6976/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6977/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6978/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6979/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6980///
6981/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
6982#[inline]
6983#[target_feature(enable = "avx512fp16")]
6984#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6985#[rustc_legacy_const_generics(3)]
6986#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6987pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6988    unsafe {
6989        static_assert_rounding!(ROUNDING);
6990        let extracta: f16 = simd_extract!(a, 0);
6991        let extractb: f16 = simd_extract!(b, 0);
6992        let extractc: f16 = simd_extract!(c, 0);
6993        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
6994        simd_insert!(a, 0, r)
6995    }
6996}
6997
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
7002///
7003/// Rounding is done according to the rounding parameter, which can be one of:
7004///
7005/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7006/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7007/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7008/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7009/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7010///
7011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
7012#[inline]
7013#[target_feature(enable = "avx512fp16")]
7014#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7015#[rustc_legacy_const_generics(4)]
7016#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7017pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
7018    a: __m128h,
7019    k: __mmask8,
7020    b: __m128h,
7021    c: __m128h,
7022) -> __m128h {
7023    unsafe {
7024        static_assert_rounding!(ROUNDING);
7025        let mut fnmsub: f16 = simd_extract!(a, 0);
7026        if k & 1 != 0 {
7027            let extractb: f16 = simd_extract!(b, 0);
7028            let extractc: f16 = simd_extract!(c, 0);
7029            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
7030        }
7031        simd_insert!(a, 0, fnmsub)
7032    }
7033}
7034
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from c to the upper elements of dst.
7039///
7040/// Rounding is done according to the rounding parameter, which can be one of:
7041///
7042/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7043/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7044/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7045/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7046/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7047///
7048/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
7049#[inline]
7050#[target_feature(enable = "avx512fp16")]
7051#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7052#[rustc_legacy_const_generics(4)]
7053#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7054pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
7055    a: __m128h,
7056    b: __m128h,
7057    c: __m128h,
7058    k: __mmask8,
7059) -> __m128h {
7060    unsafe {
7061        static_assert_rounding!(ROUNDING);
7062        let mut fnmsub: f16 = simd_extract!(c, 0);
7063        if k & 1 != 0 {
7064            let extracta: f16 = simd_extract!(a, 0);
7065            let extractb: f16 = simd_extract!(b, 0);
7066            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
7067        }
7068        simd_insert!(c, 0, fnmsub)
7069    }
7070}
7071
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// zeromask k (the element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
7076///
7077/// Rounding is done according to the rounding parameter, which can be one of:
7078///
7079/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7080/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7081/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7082/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7083/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7084///
7085/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
7086#[inline]
7087#[target_feature(enable = "avx512fp16")]
7088#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7089#[rustc_legacy_const_generics(4)]
7090#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7091pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
7092    k: __mmask8,
7093    a: __m128h,
7094    b: __m128h,
7095    c: __m128h,
7096) -> __m128h {
7097    unsafe {
7098        static_assert_rounding!(ROUNDING);
7099        let mut fnmsub: f16 = 0.0;
7100        if k & 1 != 0 {
7101            let extracta: f16 = simd_extract!(a, 0);
7102            let extractb: f16 = simd_extract!(b, 0);
7103            let extractc: f16 = simd_extract!(c, 0);
7104            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
7105        }
7106        simd_insert!(a, 0, fnmsub)
7107    }
7108}
7109
7110/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7111/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7112///
7113/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
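///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` and `avx512vl`
/// support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Even lanes become 2.0 * 3.0 - 1.0 == 5.0, odd lanes become 2.0 * 3.0 + 1.0 == 7.0.
/// let r = _mm_fmaddsub_ph(a, b, c);
/// ```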
7114#[inline]
7115#[target_feature(enable = "avx512fp16,avx512vl")]
7116#[cfg_attr(test, assert_instr(vfmaddsub))]
7117#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7118pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7119    unsafe { vfmaddsubph_128(a, b, c) }
7120}
7121
7122/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7123/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7124/// (the element is copied from a when the corresponding mask bit is not set).
7125///
7126/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
7127#[inline]
7128#[target_feature(enable = "avx512fp16,avx512vl")]
7129#[cfg_attr(test, assert_instr(vfmaddsub))]
7130#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7131pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7132    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
7133}
7134
7135/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7136/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7137/// (the element is copied from c when the corresponding mask bit is not set).
7138///
7139/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
7140#[inline]
7141#[target_feature(enable = "avx512fp16,avx512vl")]
7142#[cfg_attr(test, assert_instr(vfmaddsub))]
7143#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7144pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7145    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) }
7146}
7147
7148/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7149/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7150/// (the element is zeroed out when the corresponding mask bit is not set).
7151///
7152/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
7153#[inline]
7154#[target_feature(enable = "avx512fp16,avx512vl")]
7155#[cfg_attr(test, assert_instr(vfmaddsub))]
7156#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7157pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7158    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) }
7159}
7160
7161/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7162/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7163///
7164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
7165#[inline]
7166#[target_feature(enable = "avx512fp16,avx512vl")]
7167#[cfg_attr(test, assert_instr(vfmaddsub))]
7168#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7169pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7170    unsafe { vfmaddsubph_256(a, b, c) }
7171}
7172
7173/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7174/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7175/// (the element is copied from a when the corresponding mask bit is not set).
7176///
7177/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
7178#[inline]
7179#[target_feature(enable = "avx512fp16,avx512vl")]
7180#[cfg_attr(test, assert_instr(vfmaddsub))]
7181#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7182pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7183    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) }
7184}
7185
7186/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7187/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7188/// (the element is copied from c when the corresponding mask bit is not set).
7189///
7190/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
7191#[inline]
7192#[target_feature(enable = "avx512fp16,avx512vl")]
7193#[cfg_attr(test, assert_instr(vfmaddsub))]
7194#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7195pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7196    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) }
7197}
7198
7199/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7200/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7201/// (the element is zeroed out when the corresponding mask bit is not set).
7202///
7203/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
7204#[inline]
7205#[target_feature(enable = "avx512fp16,avx512vl")]
7206#[cfg_attr(test, assert_instr(vfmaddsub))]
7207#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7208pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7209    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) }
7210}
7211
7212/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7213/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7214///
7215/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
7216#[inline]
7217#[target_feature(enable = "avx512fp16")]
7218#[cfg_attr(test, assert_instr(vfmaddsub))]
7219#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7220pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7221    _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7222}
7223
7224/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7225/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7226/// (the element is copied from a when the corresponding mask bit is not set).
7227///
7228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
7229#[inline]
7230#[target_feature(enable = "avx512fp16")]
7231#[cfg_attr(test, assert_instr(vfmaddsub))]
7232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7233pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7234    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) }
7235}
7236
7237/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7238/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7239/// (the element is copied from c when the corresponding mask bit is not set).
7240///
7241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
7242#[inline]
7243#[target_feature(enable = "avx512fp16")]
7244#[cfg_attr(test, assert_instr(vfmaddsub))]
7245#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7246pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7247    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
7248}
7249
7250/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7251/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7252/// (the element is zeroed out when the corresponding mask bit is not set).
7253///
7254/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
7255#[inline]
7256#[target_feature(enable = "avx512fp16")]
7257#[cfg_attr(test, assert_instr(vfmaddsub))]
7258#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7259pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7260    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
7261}
7262
7263/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7264/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7265///
7266/// Rounding is done according to the rounding parameter, which can be one of:
7267///
7268/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7269/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7270/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7271/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7272/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7273///
7274/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
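///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm512_set1_ph(2.0);
/// let b = _mm512_set1_ph(3.0);
/// let c = _mm512_set1_ph(1.0);
/// // Even lanes become 2.0 * 3.0 - 1.0 == 5.0 and odd lanes 2.0 * 3.0 + 1.0 == 7.0,
/// // with round-to-nearest and exceptions suppressed.
/// let r = _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```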
7275#[inline]
7276#[target_feature(enable = "avx512fp16")]
7277#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7278#[rustc_legacy_const_generics(3)]
7279#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7280pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
7281    a: __m512h,
7282    b: __m512h,
7283    c: __m512h,
7284) -> __m512h {
7285    unsafe {
7286        static_assert_rounding!(ROUNDING);
7287        vfmaddsubph_512(a, b, c, ROUNDING)
7288    }
7289}
7290
7291/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7292/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7293/// (the element is copied from a when the corresponding mask bit is not set).
7294///
7295/// Rounding is done according to the rounding parameter, which can be one of:
7296///
7297/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7298/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7299/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7300/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7301/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7302///
7303/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
7304#[inline]
7305#[target_feature(enable = "avx512fp16")]
7306#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7307#[rustc_legacy_const_generics(4)]
7308#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7309pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
7310    a: __m512h,
7311    k: __mmask32,
7312    b: __m512h,
7313    c: __m512h,
7314) -> __m512h {
7315    unsafe {
7316        static_assert_rounding!(ROUNDING);
7317        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
7318    }
7319}
7320
7321/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7322/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7323/// (the element is copied from c when the corresponding mask bit is not set).
7324///
7325/// Rounding is done according to the rounding parameter, which can be one of:
7326///
7327/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7328/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7329/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7330/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7331/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7332///
7333/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
7334#[inline]
7335#[target_feature(enable = "avx512fp16")]
7336#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7337#[rustc_legacy_const_generics(4)]
7338#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7339pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
7340    a: __m512h,
7341    b: __m512h,
7342    c: __m512h,
7343    k: __mmask32,
7344) -> __m512h {
7345    unsafe {
7346        static_assert_rounding!(ROUNDING);
7347        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
7348    }
7349}
7350
7351/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7352/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7353/// (the element is zeroed out when the corresponding mask bit is not set).
7354///
7355/// Rounding is done according to the rounding parameter, which can be one of:
7356///
7357/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7358/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7359/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7360/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7361/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7362///
7363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
7364#[inline]
7365#[target_feature(enable = "avx512fp16")]
7366#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7367#[rustc_legacy_const_generics(4)]
7368#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7369pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
7370    k: __mmask32,
7371    a: __m512h,
7372    b: __m512h,
7373    c: __m512h,
7374) -> __m512h {
7375    unsafe {
7376        static_assert_rounding!(ROUNDING);
7377        simd_select_bitmask(
7378            k,
7379            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
7380            _mm512_setzero_ph(),
7381        )
7382    }
7383}
7384
7385/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7386/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7387///
7388/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
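///
/// # Example
///
/// A minimal illustrative sketch, assuming a nightly toolchain with the unstable `f16` type and
/// the `stdarch_x86_avx512_f16` feature, run only after verifying `avx512fp16` and `avx512vl`
/// support at runtime:
///
/// ```ignore
/// // Illustrative values only; this sketch is not compiled as a doctest.
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Even lanes become 2.0 * 3.0 + 1.0 == 7.0, odd lanes become 2.0 * 3.0 - 1.0 == 5.0.
/// let r = _mm_fmsubadd_ph(a, b, c);
/// ```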
7389#[inline]
7390#[target_feature(enable = "avx512fp16,avx512vl")]
7391#[cfg_attr(test, assert_instr(vfmsubadd))]
7392#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7393pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7394    unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
7395}
7396
7397/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7398/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7399/// (the element is copied from a when the corresponding mask bit is not set).
7400///
7401/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
7402#[inline]
7403#[target_feature(enable = "avx512fp16,avx512vl")]
7404#[cfg_attr(test, assert_instr(vfmsubadd))]
7405#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7406pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7407    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
7408}
7409
7410/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7411/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7412/// (the element is copied from c when the corresponding mask bit is not set).
7413///
7414/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
7415#[inline]
7416#[target_feature(enable = "avx512fp16,avx512vl")]
7417#[cfg_attr(test, assert_instr(vfmsubadd))]
7418#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7419pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7420    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) }
7421}
7422
7423/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7424/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7425/// (the element is zeroed out when the corresponding mask bit is not set).
7426///
7427/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
7428#[inline]
7429#[target_feature(enable = "avx512fp16,avx512vl")]
7430#[cfg_attr(test, assert_instr(vfmsubadd))]
7431#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7432pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7433    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) }
7434}
7435
7436/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7437/// and add packed elements in c from/to the intermediate result, and store the results in dst.
7438///
7439/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
7440#[inline]
7441#[target_feature(enable = "avx512fp16,avx512vl")]
7442#[cfg_attr(test, assert_instr(vfmsubadd))]
7443#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7444pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7445    unsafe { vfmaddsubph_256(a, b, simd_neg(c)) }
7446}
7447
7448/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7449/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7450/// (the element is copied from a when the corresponding mask bit is not set).
7451///
7452/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
7453#[inline]
7454#[target_feature(enable = "avx512fp16,avx512vl")]
7455#[cfg_attr(test, assert_instr(vfmsubadd))]
7456#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7457pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7458    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) }
7459}
7460
7461/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7462/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7463/// (the element is copied from c when the corresponding mask bit is not set).
7464///
7465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
7466#[inline]
7467#[target_feature(enable = "avx512fp16,avx512vl")]
7468#[cfg_attr(test, assert_instr(vfmsubadd))]
7469#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7470pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7471    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) }
7472}
7473
7474/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7475/// and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k
7476/// (the element is zeroed out when the corresponding mask bit is not set).
7477///
7478/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
7479#[inline]
7480#[target_feature(enable = "avx512fp16,avx512vl")]
7481#[cfg_attr(test, assert_instr(vfmsubadd))]
7482#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7483pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7484    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) }
7485}
7486
7487/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7488/// and add packed elements in c from/to the intermediate result, and store the results in dst.
7489///
7490/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
7491#[inline]
7492#[target_feature(enable = "avx512fp16")]
7493#[cfg_attr(test, assert_instr(vfmsubadd))]
7494#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7495pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7496    _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7497}
7498
7499/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7500/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7501/// (the element is copied from a when the corresponding mask bit is not set).
7502///
7503/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
7504#[inline]
7505#[target_feature(enable = "avx512fp16")]
7506#[cfg_attr(test, assert_instr(vfmsubadd))]
7507#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7508pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7509    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) }
7510}
7511
7512/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7513/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7514/// (the element is copied from c when the corresponding mask bit is not set).
7515///
7516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
7517#[inline]
7518#[target_feature(enable = "avx512fp16")]
7519#[cfg_attr(test, assert_instr(vfmsubadd))]
7520#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7521pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7522    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
7523}
7524
7525/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7526/// and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k
7527/// (the element is zeroed out when the corresponding mask bit is not set).
7528///
7529/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
7530#[inline]
7531#[target_feature(enable = "avx512fp16")]
7532#[cfg_attr(test, assert_instr(vfmsubadd))]
7533#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7534pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7535    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
7536}
7537
7538/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7539/// and add packed elements in c from/to the intermediate result, and store the results in dst.
7540///
7541/// Rounding is done according to the rounding parameter, which can be one of:
7542///
7543/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7544/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7545/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7546/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7547/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7548///
7549/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
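///
/// # Example
///
/// A sketch of passing an explicit rounding mode through the const generic parameter
/// (values are illustrative):
///
/// ```ignore
/// let a = _mm512_set1_ph(2.0);
/// let b = _mm512_set1_ph(3.0);
/// let c = _mm512_set1_ph(1.0);
/// // Round intermediate results toward zero and suppress exceptions.
/// let r = _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```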
7550#[inline]
7551#[target_feature(enable = "avx512fp16")]
7552#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7553#[rustc_legacy_const_generics(3)]
7554#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7555pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
7556    a: __m512h,
7557    b: __m512h,
7558    c: __m512h,
7559) -> __m512h {
7560    unsafe {
7561        static_assert_rounding!(ROUNDING);
7562        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
7563    }
7564}
7565
7566/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7567/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7568/// (the element is copied from a when the corresponding mask bit is not set).
7569///
7570/// Rounding is done according to the rounding parameter, which can be one of:
7571///
7572/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7573/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7574/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7575/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7576/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7577///
7578/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
7579#[inline]
7580#[target_feature(enable = "avx512fp16")]
7581#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7582#[rustc_legacy_const_generics(4)]
7583#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7584pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
7585    a: __m512h,
7586    k: __mmask32,
7587    b: __m512h,
7588    c: __m512h,
7589) -> __m512h {
7590    unsafe {
7591        static_assert_rounding!(ROUNDING);
7592        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
7593    }
7594}
7595
7596/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7597/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7598/// (the element is copied from c when the corresponding mask bit is not set).
7599///
7600/// Rounding is done according to the rounding parameter, which can be one of:
7601///
7602/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7603/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7604/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7605/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7606/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7607///
7608/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
7609#[inline]
7610#[target_feature(enable = "avx512fp16")]
7611#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7612#[rustc_legacy_const_generics(4)]
7613#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7614pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
7615    a: __m512h,
7616    b: __m512h,
7617    c: __m512h,
7618    k: __mmask32,
7619) -> __m512h {
7620    unsafe {
7621        static_assert_rounding!(ROUNDING);
7622        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
7623    }
7624}
7625
7626/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7627/// and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k
7628/// (the element is zeroed out when the corresponding mask bit is not set).
7629///
7630/// Rounding is done according to the rounding parameter, which can be one of:
7631///
7632/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7633/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7634/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7635/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7636/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7637///
7638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
7639#[inline]
7640#[target_feature(enable = "avx512fp16")]
7641#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7642#[rustc_legacy_const_generics(4)]
7643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7644pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
7645    k: __mmask32,
7646    a: __m512h,
7647    b: __m512h,
7648    c: __m512h,
7649) -> __m512h {
7650    unsafe {
7651        static_assert_rounding!(ROUNDING);
7652        simd_select_bitmask(
7653            k,
7654            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
7655            _mm512_setzero_ph(),
7656        )
7657    }
7658}
7659
7660/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7661/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7662///
7663/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
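///
/// # Example
///
/// An illustrative sketch (nightly `f16` and `avx512fp16` assumed):
///
/// ```ignore
/// let a = _mm_set1_ph(4.0);
/// // Every lane is approximately 0.25, within a relative error of 1.5*2^-12.
/// let r = _mm_rcp_ph(a);
/// ```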
7664#[inline]
7665#[target_feature(enable = "avx512fp16,avx512vl")]
7666#[cfg_attr(test, assert_instr(vrcpph))]
7667#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7668pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
7669    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
7670}
7671
7672/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7673/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7674/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7675///
7676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
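///
/// # Example
///
/// A hypothetical masked call (illustrative values):
///
/// ```ignore
/// let src = _mm_set1_ph(9.0);
/// let a = _mm_set1_ph(2.0);
/// // Lanes 0 and 1 are approximately 0.5; lanes 2..=7 are copied from `src` (9.0).
/// let r = _mm_mask_rcp_ph(src, 0b0000_0011, a);
/// ```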
7677#[inline]
7678#[target_feature(enable = "avx512fp16,avx512vl")]
7679#[cfg_attr(test, assert_instr(vrcpph))]
7680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7681pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7682    unsafe { vrcpph_128(a, src, k) }
7683}
7684
7685/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7686/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7687/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7688///
7689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
7690#[inline]
7691#[target_feature(enable = "avx512fp16,avx512vl")]
7692#[cfg_attr(test, assert_instr(vrcpph))]
7693#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7694pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
7695    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
7696}
7697
7698/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7699/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7700///
7701/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
7702#[inline]
7703#[target_feature(enable = "avx512fp16,avx512vl")]
7704#[cfg_attr(test, assert_instr(vrcpph))]
7705#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7706pub fn _mm256_rcp_ph(a: __m256h) -> __m256h {
7707    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
7708}
7709
7710/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7711/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7712/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7713///
7714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
7715#[inline]
7716#[target_feature(enable = "avx512fp16,avx512vl")]
7717#[cfg_attr(test, assert_instr(vrcpph))]
7718#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7719pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7720    unsafe { vrcpph_256(a, src, k) }
7721}
7722
7723/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7724/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7725/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7726///
7727/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
7728#[inline]
7729#[target_feature(enable = "avx512fp16,avx512vl")]
7730#[cfg_attr(test, assert_instr(vrcpph))]
7731#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7732pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
7733    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
7734}
7735
7736/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7737/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7738///
7739/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
7740#[inline]
7741#[target_feature(enable = "avx512fp16")]
7742#[cfg_attr(test, assert_instr(vrcpph))]
7743#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7744pub fn _mm512_rcp_ph(a: __m512h) -> __m512h {
7745    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
7746}
7747
7748/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7749/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7750/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7751///
7752/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
7753#[inline]
7754#[target_feature(enable = "avx512fp16")]
7755#[cfg_attr(test, assert_instr(vrcpph))]
7756#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7757pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7758    unsafe { vrcpph_512(a, src, k) }
7759}
7760
7761/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7762/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7763/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7764///
7765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
7766#[inline]
7767#[target_feature(enable = "avx512fp16")]
7768#[cfg_attr(test, assert_instr(vrcpph))]
7769#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7770pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
7771    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
7772}
7773
7774/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7775/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
7776/// upper elements of dst.
7777/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7778///
7779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
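///
/// # Example
///
/// A sketch of the scalar form (illustrative values; same nightly/CPU assumptions as above):
///
/// ```ignore
/// let a = _mm_set1_ph(8.0);
/// let b = _mm_set_sh(4.0);
/// // Lane 0 is approximately 0.25 (the reciprocal of the lower element of `b`);
/// // lanes 1..=7 are copied from `a`.
/// let r = _mm_rcp_sh(a, b);
/// ```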
7780#[inline]
7781#[target_feature(enable = "avx512fp16")]
7782#[cfg_attr(test, assert_instr(vrcpsh))]
7783#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7784pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
7785    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
7786}
7787
7788/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7789/// store the result in the lower element of dst using writemask k (the element is copied from src when
7790/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7791/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7792///
7793/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
7794#[inline]
7795#[target_feature(enable = "avx512fp16")]
7796#[cfg_attr(test, assert_instr(vrcpsh))]
7797#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7798pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7799    unsafe { vrcpsh(a, b, src, k) }
7800}
7801
7802/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7803/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
7804/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7805/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7806///
7807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
7808#[inline]
7809#[target_feature(enable = "avx512fp16")]
7810#[cfg_attr(test, assert_instr(vrcpsh))]
7811#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7812pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7813    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b)
7814}
7815
7816/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7817/// elements in a, and store the results in dst.
7818/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7819///
7820/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
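///
/// # Example
///
/// An illustrative sketch:
///
/// ```ignore
/// let a = _mm_set1_ph(16.0);
/// // Every lane is approximately 1 / sqrt(16.0) = 0.25.
/// let r = _mm_rsqrt_ph(a);
/// ```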
7821#[inline]
7822#[target_feature(enable = "avx512fp16,avx512vl")]
7823#[cfg_attr(test, assert_instr(vrsqrtph))]
7824#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7825pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
7826    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
7827}
7828
7829/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7830/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7831/// the corresponding mask bit is not set).
7832/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7833///
7834/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
7835#[inline]
7836#[target_feature(enable = "avx512fp16,avx512vl")]
7837#[cfg_attr(test, assert_instr(vrsqrtph))]
7838#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7839pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7840    unsafe { vrsqrtph_128(a, src, k) }
7841}
7842
7843/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7844/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7845/// corresponding mask bit is not set).
7846/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7847///
7848/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
7849#[inline]
7850#[target_feature(enable = "avx512fp16,avx512vl")]
7851#[cfg_attr(test, assert_instr(vrsqrtph))]
7852#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7853pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
7854    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
7855}
7856
7857/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7858/// elements in a, and store the results in dst.
7859/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7860///
7861/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
7862#[inline]
7863#[target_feature(enable = "avx512fp16,avx512vl")]
7864#[cfg_attr(test, assert_instr(vrsqrtph))]
7865#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7866pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
7867    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
7868}
7869
7870/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7871/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7872/// the corresponding mask bit is not set).
7873/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7874///
7875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
7876#[inline]
7877#[target_feature(enable = "avx512fp16,avx512vl")]
7878#[cfg_attr(test, assert_instr(vrsqrtph))]
7879#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7880pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7881    unsafe { vrsqrtph_256(a, src, k) }
7882}
7883
7884/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7885/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7886/// corresponding mask bit is not set).
7887/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7888///
7889/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
7890#[inline]
7891#[target_feature(enable = "avx512fp16,avx512vl")]
7892#[cfg_attr(test, assert_instr(vrsqrtph))]
7893#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7894pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
7895    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
7896}
7897
7898/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7899/// elements in a, and store the results in dst.
7900/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7901///
7902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
7903#[inline]
7904#[target_feature(enable = "avx512fp16")]
7905#[cfg_attr(test, assert_instr(vrsqrtph))]
7906#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7907pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
7908    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
7909}
7910
7911/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7912/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7913/// the corresponding mask bit is not set).
7914/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7915///
7916/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
7917#[inline]
7918#[target_feature(enable = "avx512fp16")]
7919#[cfg_attr(test, assert_instr(vrsqrtph))]
7920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7921pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7922    unsafe { vrsqrtph_512(a, src, k) }
7923}
7924
7925/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7926/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7927/// corresponding mask bit is not set).
7928/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7929///
7930/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
7931#[inline]
7932#[target_feature(enable = "avx512fp16")]
7933#[cfg_attr(test, assert_instr(vrsqrtph))]
7934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7935pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
7936    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
7937}
7938
7939/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7940/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
7941/// to the upper elements of dst.
7942/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7943///
7944/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
7945#[inline]
7946#[target_feature(enable = "avx512fp16")]
7947#[cfg_attr(test, assert_instr(vrsqrtsh))]
7948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7949pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
7950    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
7951}
7952
7953/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7954/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
7955/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7956/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7957///
7958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
7959#[inline]
7960#[target_feature(enable = "avx512fp16")]
7961#[cfg_attr(test, assert_instr(vrsqrtsh))]
7962#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7963pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7964    unsafe { vrsqrtsh(a, b, src, k) }
7965}
7966
7967/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7968/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
7969/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7970/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7971///
7972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
7973#[inline]
7974#[target_feature(enable = "avx512fp16")]
7975#[cfg_attr(test, assert_instr(vrsqrtsh))]
7976#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7977pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7978    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
7979}
7980
7981/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7982/// results in dst.
7983///
7984/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
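///
/// # Example
///
/// An illustrative sketch (nightly `f16` and `avx512fp16` assumed):
///
/// ```ignore
/// let a = _mm_set_ph(64.0, 49.0, 36.0, 25.0, 16.0, 9.0, 4.0, 1.0);
/// // Lanes 0..=7 become 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0.
/// let r = _mm_sqrt_ph(a);
/// ```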
7985#[inline]
7986#[target_feature(enable = "avx512fp16,avx512vl")]
7987#[cfg_attr(test, assert_instr(vsqrtph))]
7988#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7989pub fn _mm_sqrt_ph(a: __m128h) -> __m128h {
7990    unsafe { simd_fsqrt(a) }
7991}
7992
7993/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7994/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7995///
7996/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
7997#[inline]
7998#[target_feature(enable = "avx512fp16,avx512vl")]
7999#[cfg_attr(test, assert_instr(vsqrtph))]
8000#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8001pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8002    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) }
8003}
8004
8005/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8006/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8007///
8008/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
8009#[inline]
8010#[target_feature(enable = "avx512fp16,avx512vl")]
8011#[cfg_attr(test, assert_instr(vsqrtph))]
8012#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8013pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
8014    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) }
8015}
8016
8017/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8018/// results in dst.
8019///
8020/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
8021#[inline]
8022#[target_feature(enable = "avx512fp16,avx512vl")]
8023#[cfg_attr(test, assert_instr(vsqrtph))]
8024#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8025pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
8026    unsafe { simd_fsqrt(a) }
8027}
8028
8029/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8030/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8031///
8032/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
8033#[inline]
8034#[target_feature(enable = "avx512fp16,avx512vl")]
8035#[cfg_attr(test, assert_instr(vsqrtph))]
8036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8037pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8038    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) }
8039}
8040
8041/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8042/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8043///
8044/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
8045#[inline]
8046#[target_feature(enable = "avx512fp16,avx512vl")]
8047#[cfg_attr(test, assert_instr(vsqrtph))]
8048#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8049pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
8050    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) }
8051}
8052
8053/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8054/// results in dst.
8055///
8056/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
8057#[inline]
8058#[target_feature(enable = "avx512fp16")]
8059#[cfg_attr(test, assert_instr(vsqrtph))]
8060#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8061pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
8062    unsafe { simd_fsqrt(a) }
8063}
8064
8065/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8066/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8067///
8068/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
8069#[inline]
8070#[target_feature(enable = "avx512fp16")]
8071#[cfg_attr(test, assert_instr(vsqrtph))]
8072#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8073pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8074    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
8075}
8076
8077/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8078/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8079///
8080/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
8081#[inline]
8082#[target_feature(enable = "avx512fp16")]
8083#[cfg_attr(test, assert_instr(vsqrtph))]
8084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8085pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
8086    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
8087}
8088
8089/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8090/// results in dst.
8091/// Rounding is done according to the rounding parameter, which can be one of:
8092///
8093/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8094/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8095/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8096/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8097/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8098///
8099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
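///
/// # Example
///
/// A sketch showing an explicit rounding mode (illustrative values):
///
/// ```ignore
/// let a = _mm512_set1_ph(2.0);
/// // Round each square root toward negative infinity and suppress exceptions.
/// let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// ```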
8100#[inline]
8101#[target_feature(enable = "avx512fp16")]
8102#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8103#[rustc_legacy_const_generics(1)]
8104#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8105pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
8106    unsafe {
8107        static_assert_rounding!(ROUNDING);
8108        vsqrtph_512(a, ROUNDING)
8109    }
8110}
8111
8112/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8113/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8114/// Rounding is done according to the rounding parameter, which can be one of:
8115///
8116/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8117/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8118/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8119/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8120/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8121///
8122/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
8123#[inline]
8124#[target_feature(enable = "avx512fp16")]
8125#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8126#[rustc_legacy_const_generics(3)]
8127#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8128pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
8129    src: __m512h,
8130    k: __mmask32,
8131    a: __m512h,
8132) -> __m512h {
8133    unsafe {
8134        static_assert_rounding!(ROUNDING);
8135        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
8136    }
8137}
8138
8139/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8140/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8141/// Rounding is done according to the rounding parameter, which can be one of:
8142///
8143/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8144/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8145/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8146/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8147/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8148///
8149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
8150#[inline]
8151#[target_feature(enable = "avx512fp16")]
8152#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8153#[rustc_legacy_const_generics(2)]
8154#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8155pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
8156    unsafe {
8157        static_assert_rounding!(ROUNDING);
8158        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
8159    }
8160}
8161
8162/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8163/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8164/// elements of dst.
8165///
8166/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
8167#[inline]
8168#[target_feature(enable = "avx512fp16")]
8169#[cfg_attr(test, assert_instr(vsqrtsh))]
8170#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8171pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
8172    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
8173}
8174
8175/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8176/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8177/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8178///
8179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
8180#[inline]
8181#[target_feature(enable = "avx512fp16")]
8182#[cfg_attr(test, assert_instr(vsqrtsh))]
8183#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8184pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8185    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8186}
8187
8188/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8189/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8190/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8191///
8192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
8193#[inline]
8194#[target_feature(enable = "avx512fp16")]
8195#[cfg_attr(test, assert_instr(vsqrtsh))]
8196#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8197pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8198    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
8199}
8200
8201/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8202/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8203/// elements of dst.
8204/// Rounding is done according to the rounding parameter, which can be one of:
8205///
8206/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8207/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8208/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8209/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8210/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8211///
8212/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
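///
/// # Example
///
/// A hypothetical scalar call with explicit rounding (illustrative values):
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set_sh(2.0);
/// // Lane 0 is sqrt(2.0) rounded toward zero; lanes 1..=7 are copied from `a`.
/// let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
/// ```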
8213#[inline]
8214#[target_feature(enable = "avx512fp16")]
8215#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8216#[rustc_legacy_const_generics(2)]
8217#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8218pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
8219    static_assert_rounding!(ROUNDING);
8220    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
8221}
8222
8223/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8224/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8225/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8226/// Rounding is done according to the rounding parameter, which can be one of:
8227///
8228/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8229/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8230/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8231/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8232/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8233///
8234/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
8235#[inline]
8236#[target_feature(enable = "avx512fp16")]
8237#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8238#[rustc_legacy_const_generics(4)]
8239#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8240pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
8241    src: __m128h,
8242    k: __mmask8,
8243    a: __m128h,
8244    b: __m128h,
8245) -> __m128h {
8246    unsafe {
8247        static_assert_rounding!(ROUNDING);
8248        vsqrtsh(a, b, src, k, ROUNDING)
8249    }
8250}
8251
8252/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8253/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8254/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8255/// Rounding is done according to the rounding parameter, which can be one of:
8256///
8257/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8258/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8259/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8260/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8261/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8262///
8263/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
8264#[inline]
8265#[target_feature(enable = "avx512fp16")]
8266#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8267#[rustc_legacy_const_generics(3)]
8268#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8269pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
8270    k: __mmask8,
8271    a: __m128h,
8272    b: __m128h,
8273) -> __m128h {
8274    static_assert_rounding!(ROUNDING);
8275    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
8276}
8277
8278/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8279/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8280/// value when inputs are NaN or signed-zero values.
8281///
8282/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
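///
/// # Example
///
/// An illustrative sketch (NaN and signed-zero handling follows the hardware, not IEEE 754):
///
/// ```ignore
/// let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
/// let b = _mm_set1_ph(4.5);
/// // Lanes 0..=3 become 4.5; lanes 4..=7 become 5.0, 6.0, 7.0, 8.0.
/// let r = _mm_max_ph(a, b);
/// ```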
8283#[inline]
8284#[target_feature(enable = "avx512fp16,avx512vl")]
8285#[cfg_attr(test, assert_instr(vmaxph))]
8286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8287pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
8288    unsafe { vmaxph_128(a, b) }
8289}
8290
8291/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8292/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8293/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8294/// NaN or signed-zero values.
8295///
8296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
8297#[inline]
8298#[target_feature(enable = "avx512fp16,avx512vl")]
8299#[cfg_attr(test, assert_instr(vmaxph))]
8300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8301pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8302    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) }
8303}
8304
8305/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8306/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8307/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8308/// NaN or signed-zero values.
8309///
8310/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
8311#[inline]
8312#[target_feature(enable = "avx512fp16,avx512vl")]
8313#[cfg_attr(test, assert_instr(vmaxph))]
8314#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8315pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8316    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) }
8317}
8318
8319/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8320/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8321/// value when inputs are NaN or signed-zero values.
8322///
8323/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
8324#[inline]
8325#[target_feature(enable = "avx512fp16,avx512vl")]
8326#[cfg_attr(test, assert_instr(vmaxph))]
8327#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8328pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
8329    unsafe { vmaxph_256(a, b) }
8330}
8331
8332/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8333/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8334/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8335/// NaN or signed-zero values.
8336///
8337/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
8338#[inline]
8339#[target_feature(enable = "avx512fp16,avx512vl")]
8340#[cfg_attr(test, assert_instr(vmaxph))]
8341#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8342pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8343    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) }
8344}
8345
8346/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8347/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8348/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8349/// NaN or signed-zero values.
8350///
8351/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
8352#[inline]
8353#[target_feature(enable = "avx512fp16,avx512vl")]
8354#[cfg_attr(test, assert_instr(vmaxph))]
8355#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8356pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8357    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) }
8358}
8359
8360/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8361/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8362/// value when inputs are NaN or signed-zero values.
8363///
8364/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
8365#[inline]
8366#[target_feature(enable = "avx512fp16")]
8367#[cfg_attr(test, assert_instr(vmaxph))]
8368#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8369pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
8370    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8371}
8372
8373/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8374/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8375/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8376/// NaN or signed-zero values.
8377///
8378/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
8379#[inline]
8380#[target_feature(enable = "avx512fp16")]
8381#[cfg_attr(test, assert_instr(vmaxph))]
8382#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8383pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8384    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) }
8385}
8386
8387/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8388/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8389/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8390/// NaN or signed-zero values.
8391///
8392/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
8393#[inline]
8394#[target_feature(enable = "avx512fp16")]
8395#[cfg_attr(test, assert_instr(vmaxph))]
8396#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8397pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8398    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) }
8399}
8400
8401/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8402/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8403/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8404/// NaN or signed-zero values.
8405///
8406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
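///
/// # Example
///
/// A sketch of suppressing exceptions via the `SAE` parameter (illustrative values):
///
/// ```ignore
/// let a = _mm512_set1_ph(1.0);
/// let b = _mm512_set1_ph(2.0);
/// // Every lane becomes 2.0; floating-point exceptions are suppressed.
/// let r = _mm512_max_round_ph::<_MM_FROUND_NO_EXC>(a, b);
/// ```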
8407#[inline]
8408#[target_feature(enable = "avx512fp16")]
8409#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8410#[rustc_legacy_const_generics(2)]
8411#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8412pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8413    unsafe {
8414        static_assert_sae!(SAE);
8415        vmaxph_512(a, b, SAE)
8416    }
8417}
8418
8419/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8420/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8421/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8422/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8423///
8424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
8425#[inline]
8426#[target_feature(enable = "avx512fp16")]
8427#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8428#[rustc_legacy_const_generics(4)]
8429#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8430pub fn _mm512_mask_max_round_ph<const SAE: i32>(
8431    src: __m512h,
8432    k: __mmask32,
8433    a: __m512h,
8434    b: __m512h,
8435) -> __m512h {
8436    unsafe {
8437        static_assert_sae!(SAE);
8438        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
8439    }
8440}
8441
8442/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8443/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8444/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8445/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8446///
8447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
8448#[inline]
8449#[target_feature(enable = "avx512fp16")]
8450#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8451#[rustc_legacy_const_generics(3)]
8452#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8453pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8454    unsafe {
8455        static_assert_sae!(SAE);
8456        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8457    }
8458}
8459
8460/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8461/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8462/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
8463/// when inputs are NaN or signed-zero values.
8464///
8465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
8466#[inline]
8467#[target_feature(enable = "avx512fp16,avx512vl")]
8468#[cfg_attr(test, assert_instr(vmaxsh))]
8469#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8470pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
8471    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
8472}
8473
8474/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8475/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8476/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8477/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8478///
8479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
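///
/// # Examples
///
/// A minimal usage sketch (illustrative only, not taken from Intel's documentation), assuming
/// the required target features are available:
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn lower_max_or_src(src: __m128h, a: __m128h, b: __m128h) -> __m128h {
///     // Mask bit 0 is set, so the lower lane holds the maximum of a[0] and b[0];
///     // the upper seven lanes are always copied from `a`.
///     _mm_mask_max_sh(src, 0b0000_0001, a, b)
/// }
/// ```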
8480#[inline]
8481#[target_feature(enable = "avx512fp16,avx512vl")]
8482#[cfg_attr(test, assert_instr(vmaxsh))]
8483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8484pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8485    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8486}
8487
8488/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8489/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8490/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8491/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8492///
8493/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
8494#[inline]
8495#[target_feature(enable = "avx512fp16,avx512vl")]
8496#[cfg_attr(test, assert_instr(vmaxsh))]
8497#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8498pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8499    _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b)
8500}
8501
8502/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8503/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8504/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8505/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8506///
8507/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
8508#[inline]
8509#[target_feature(enable = "avx512fp16,avx512vl")]
8510#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8511#[rustc_legacy_const_generics(2)]
8512#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8513pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8514    static_assert_sae!(SAE);
8515    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8516}
8517
8518/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8519/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8520/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8521/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8522/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8523///
8524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
8525#[inline]
8526#[target_feature(enable = "avx512fp16,avx512vl")]
8527#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8528#[rustc_legacy_const_generics(4)]
8529#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8530pub fn _mm_mask_max_round_sh<const SAE: i32>(
8531    src: __m128h,
8532    k: __mmask8,
8533    a: __m128h,
8534    b: __m128h,
8535) -> __m128h {
8536    unsafe {
8537        static_assert_sae!(SAE);
8538        vmaxsh(a, b, src, k, SAE)
8539    }
8540}
8541
8542/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8543/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8544/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8545/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8546/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8547///
8548/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
8549#[inline]
8550#[target_feature(enable = "avx512fp16,avx512vl")]
8551#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8552#[rustc_legacy_const_generics(3)]
8553#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8554pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8555    static_assert_sae!(SAE);
8556    _mm_mask_max_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
8557}
8558
8559/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8560/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8561/// when inputs are NaN or signed-zero values.
8562///
8563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
8564#[inline]
8565#[target_feature(enable = "avx512fp16,avx512vl")]
8566#[cfg_attr(test, assert_instr(vminph))]
8567#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8568pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
8569    unsafe { vminph_128(a, b) }
8570}
8571
8572/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8573/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8574/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8575/// NaN or signed-zero values.
8576///
8577/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
8578#[inline]
8579#[target_feature(enable = "avx512fp16,avx512vl")]
8580#[cfg_attr(test, assert_instr(vminph))]
8581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8582pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8583    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
8584}
8585
8586/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8587/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8588/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8589/// NaN or signed-zero values.
8590///
8591/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
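///
/// # Examples
///
/// A minimal usage sketch (illustrative only, not taken from Intel's documentation), assuming
/// the required target features are available:
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn masked_min(a: __m128h, b: __m128h) -> __m128h {
///     // Keep the per-lane minimum in the four low lanes; zero the four high lanes.
///     _mm_maskz_min_ph(0b0000_1111, a, b)
/// }
/// ```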
8592#[inline]
8593#[target_feature(enable = "avx512fp16,avx512vl")]
8594#[cfg_attr(test, assert_instr(vminph))]
8595#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8596pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8597    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) }
8598}
8599
8600/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8601/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8602/// when inputs are NaN or signed-zero values.
8603///
8604/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
8605#[inline]
8606#[target_feature(enable = "avx512fp16,avx512vl")]
8607#[cfg_attr(test, assert_instr(vminph))]
8608#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8609pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
8610    unsafe { vminph_256(a, b) }
8611}
8612
8613/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8614/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8615/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8616/// NaN or signed-zero values.
8617///
8618/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
8619#[inline]
8620#[target_feature(enable = "avx512fp16,avx512vl")]
8621#[cfg_attr(test, assert_instr(vminph))]
8622#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8623pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8624    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) }
8625}
8626
8627/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8628/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8629/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8630/// NaN or signed-zero values.
8631///
8632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
8633#[inline]
8634#[target_feature(enable = "avx512fp16,avx512vl")]
8635#[cfg_attr(test, assert_instr(vminph))]
8636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8637pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8638    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) }
8639}
8640
8641/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8642/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8643/// when inputs are NaN or signed-zero values.
8644///
8645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
8646#[inline]
8647#[target_feature(enable = "avx512fp16")]
8648#[cfg_attr(test, assert_instr(vminph))]
8649#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8650pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
8651    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8652}
8653
8654/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8655/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8656/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8657/// NaN or signed-zero values.
8658///
8659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
8660#[inline]
8661#[target_feature(enable = "avx512fp16")]
8662#[cfg_attr(test, assert_instr(vminph))]
8663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8664pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8665    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
8666}
8667
8668/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8669/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8670/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8671/// NaN or signed-zero values.
8672///
8673/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
8674#[inline]
8675#[target_feature(enable = "avx512fp16")]
8676#[cfg_attr(test, assert_instr(vminph))]
8677#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8678pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8679    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
8680}
8681
8682/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8683/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
8684/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8685///
8686/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
8687#[inline]
8688#[target_feature(enable = "avx512fp16")]
8689#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8690#[rustc_legacy_const_generics(2)]
8691#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8692pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8693    unsafe {
8694        static_assert_sae!(SAE);
8695        vminph_512(a, b, SAE)
8696    }
8697}
8698
8699/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8700/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8701/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8702/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8703///
8704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
8705#[inline]
8706#[target_feature(enable = "avx512fp16")]
8707#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8708#[rustc_legacy_const_generics(4)]
8709#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8710pub fn _mm512_mask_min_round_ph<const SAE: i32>(
8711    src: __m512h,
8712    k: __mmask32,
8713    a: __m512h,
8714    b: __m512h,
8715) -> __m512h {
8716    unsafe {
8717        static_assert_sae!(SAE);
8718        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
8719    }
8720}
8721
8722/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8723/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8724/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8725/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8726///
8727/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
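///
/// # Examples
///
/// A minimal usage sketch (illustrative only, not taken from Intel's documentation) combining a
/// zeromask with exception suppression, assuming the required target feature is available:
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16")]
/// fn quiet_masked_min(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
///     // Lanes whose mask bit is clear are zeroed; no floating-point exceptions are raised.
///     _mm512_maskz_min_round_ph::<_MM_FROUND_NO_EXC>(k, a, b)
/// }
/// ```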
8728#[inline]
8729#[target_feature(enable = "avx512fp16")]
8730#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8731#[rustc_legacy_const_generics(3)]
8732#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8733pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8734    unsafe {
8735        static_assert_sae!(SAE);
8736        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8737    }
8738}
8739
8740/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8741/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8742/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
8743/// inputs are NaN or signed-zero values.
8744///
8745/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
8746#[inline]
8747#[target_feature(enable = "avx512fp16,avx512vl")]
8748#[cfg_attr(test, assert_instr(vminsh))]
8749#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8750pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
8751    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
8752}
8753
8754/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8755/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8756/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8757/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8758///
8759/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
8760#[inline]
8761#[target_feature(enable = "avx512fp16,avx512vl")]
8762#[cfg_attr(test, assert_instr(vminsh))]
8763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8764pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8765    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8766}
8767
8768/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8769/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8770/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8771/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8772///
8773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
8774#[inline]
8775#[target_feature(enable = "avx512fp16,avx512vl")]
8776#[cfg_attr(test, assert_instr(vminsh))]
8777#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8778pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8779    _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b)
8780}
8781
8782/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8783/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8784/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8785/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8786///
8787/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
8788#[inline]
8789#[target_feature(enable = "avx512fp16,avx512vl")]
8790#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8791#[rustc_legacy_const_generics(2)]
8792#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8793pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8794    static_assert_sae!(SAE);
8795    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8796}
8797
8798/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8799/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8800/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8801/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8802/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8803///
8804/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
8805#[inline]
8806#[target_feature(enable = "avx512fp16,avx512vl")]
8807#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8808#[rustc_legacy_const_generics(4)]
8809#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8810pub fn _mm_mask_min_round_sh<const SAE: i32>(
8811    src: __m128h,
8812    k: __mmask8,
8813    a: __m128h,
8814    b: __m128h,
8815) -> __m128h {
8816    unsafe {
8817        static_assert_sae!(SAE);
8818        vminsh(a, b, src, k, SAE)
8819    }
8820}
8821
8822/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8823/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8824/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8825/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8826/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8827///
8828/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
8829#[inline]
8830#[target_feature(enable = "avx512fp16,avx512vl")]
8831#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8832#[rustc_legacy_const_generics(3)]
8833#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8834pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8835    static_assert_sae!(SAE);
8836    _mm_mask_min_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
8837}
8838
8839/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8840/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8841/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8842///
8843/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
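///
/// # Examples
///
/// A minimal usage sketch (illustrative only, not taken from Intel's documentation), assuming
/// the unstable `f16` type and the required target features are available:
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn exponents() -> __m128h {
///     let a = _mm_set_ph(1.0, 2.0, 4.0, 8.0, 0.5, 3.0, 10.0, 100.0);
///     // Each lane becomes floor(log2(|x|)) as an f16,
///     // e.g. 3.0 -> 1.0, 0.5 -> -1.0, and 100.0 -> 6.0.
///     _mm_getexp_ph(a)
/// }
/// ```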
8844#[inline]
8845#[target_feature(enable = "avx512fp16,avx512vl")]
8846#[cfg_attr(test, assert_instr(vgetexpph))]
8847#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8848pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
8849    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
8850}
8851
8852/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8853/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8854/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8855/// `floor(log2(x))` for each element.
8856///
8857/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
8858#[inline]
8859#[target_feature(enable = "avx512fp16,avx512vl")]
8860#[cfg_attr(test, assert_instr(vgetexpph))]
8861#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8862pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8863    unsafe { vgetexpph_128(a, src, k) }
8864}
8865
8866/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8867/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8868/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8869/// `floor(log2(x))` for each element.
8870///
8871/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
8872#[inline]
8873#[target_feature(enable = "avx512fp16,avx512vl")]
8874#[cfg_attr(test, assert_instr(vgetexpph))]
8875#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8876pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
8877    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
8878}
8879
8880/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8881/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8882/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8883///
8884/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
8885#[inline]
8886#[target_feature(enable = "avx512fp16,avx512vl")]
8887#[cfg_attr(test, assert_instr(vgetexpph))]
8888#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8889pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
8890    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
8891}
8892
8893/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8894/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8895/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8896/// `floor(log2(x))` for each element.
8897///
8898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
8899#[inline]
8900#[target_feature(enable = "avx512fp16,avx512vl")]
8901#[cfg_attr(test, assert_instr(vgetexpph))]
8902#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8903pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8904    unsafe { vgetexpph_256(a, src, k) }
8905}
8906
8907/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8908/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8909/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8910/// `floor(log2(x))` for each element.
8911///
8912/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
8913#[inline]
8914#[target_feature(enable = "avx512fp16,avx512vl")]
8915#[cfg_attr(test, assert_instr(vgetexpph))]
8916#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8917pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
8918    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
8919}
8920
8921/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8922/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8923/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8924///
8925/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
8926#[inline]
8927#[target_feature(enable = "avx512fp16")]
8928#[cfg_attr(test, assert_instr(vgetexpph))]
8929#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8930pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
8931    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
8932}
8933
8934/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8935/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8936/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8937/// `floor(log2(x))` for each element.
8938///
8939/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
8940#[inline]
8941#[target_feature(enable = "avx512fp16")]
8942#[cfg_attr(test, assert_instr(vgetexpph))]
8943#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8944pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8945    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
8946}
8947
8948/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8949/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8950/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8951/// `floor(log2(x))` for each element.
8952///
8953/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
8954#[inline]
8955#[target_feature(enable = "avx512fp16")]
8956#[cfg_attr(test, assert_instr(vgetexpph))]
8957#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8958pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
8959    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
8960}
8961
8962/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8963/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8964/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
8965/// by passing _MM_FROUND_NO_EXC in the sae parameter.
8966///
8967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
8968#[inline]
8969#[target_feature(enable = "avx512fp16")]
8970#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8971#[rustc_legacy_const_generics(1)]
8972#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8973pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
8974    static_assert_sae!(SAE);
8975    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
8976}
8977
8978/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8979/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8980/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8981/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8982///
8983/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
8984#[inline]
8985#[target_feature(enable = "avx512fp16")]
8986#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8987#[rustc_legacy_const_generics(3)]
8988#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8989pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
8990    src: __m512h,
8991    k: __mmask32,
8992    a: __m512h,
8993) -> __m512h {
8994    unsafe {
8995        static_assert_sae!(SAE);
8996        vgetexpph_512(a, src, k, SAE)
8997    }
8998}
8999
9000/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9001/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9002/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9003/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9004///
9005/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
9006#[inline]
9007#[target_feature(enable = "avx512fp16")]
9008#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9009#[rustc_legacy_const_generics(2)]
9010#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9011pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
9012    static_assert_sae!(SAE);
9013    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
9014}
9015
9016/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9017/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9018/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9019/// calculates `floor(log2(x))` for the lower element.
9020///
9021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
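///
/// # Examples
///
/// A minimal usage sketch (illustrative only, not taken from Intel's documentation), assuming
/// the required target feature is available:
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16")]
/// fn lower_exponent(a: __m128h, b: __m128h) -> __m128h {
///     // dst[0] = floor(log2(|b[0]|)); dst[1..8] are copied from `a`.
///     _mm_getexp_sh(a, b)
/// }
/// ```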
9022#[inline]
9023#[target_feature(enable = "avx512fp16")]
9024#[cfg_attr(test, assert_instr(vgetexpsh))]
9025#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9026pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
9027    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
9028}
9029
9030/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9031/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9032/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9033/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9034/// for the lower element.
9035///
9036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
9037#[inline]
9038#[target_feature(enable = "avx512fp16")]
9039#[cfg_attr(test, assert_instr(vgetexpsh))]
9040#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9041pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9042    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9043}
9044
9045/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9046/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9047/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9048/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9049/// lower element.
9050///
9051/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
9052#[inline]
9053#[target_feature(enable = "avx512fp16")]
9054#[cfg_attr(test, assert_instr(vgetexpsh))]
9055#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9056pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9057    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b)
9058}
9059
9060/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9061/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9062/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9063/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9064/// in the sae parameter.
9065///
9066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
9067#[inline]
9068#[target_feature(enable = "avx512fp16")]
9069#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9070#[rustc_legacy_const_generics(2)]
9071#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9072pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
9073    static_assert_sae!(SAE);
9074    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9075}
9076
9077/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9078/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9079/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9080/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9081/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9082///
9083/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
9084#[inline]
9085#[target_feature(enable = "avx512fp16")]
9086#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9087#[rustc_legacy_const_generics(4)]
9088#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9089pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
9090    src: __m128h,
9091    k: __mmask8,
9092    a: __m128h,
9093    b: __m128h,
9094) -> __m128h {
9095    unsafe {
9096        static_assert_sae!(SAE);
9097        vgetexpsh(a, b, src, k, SAE)
9098    }
9099}
9100
9101/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9102/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9103/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9104/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9105/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9106///
9107/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
9108#[inline]
9109#[target_feature(enable = "avx512fp16")]
9110#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9111#[rustc_legacy_const_generics(3)]
9112#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9113pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9114    static_assert_sae!(SAE);
9115    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9116}
9117
9118/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9119/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9120/// on the interval range defined by norm and the sign depends on sign and the source sign.
9121///
9122/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9123///
9124///     _MM_MANT_NORM_1_2     // interval [1, 2)
9125///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9126///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9127///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9128///
9129/// The sign is determined by sc which can take the following values:
9130///
9131///     _MM_MANT_SIGN_src     // sign = sign(src)
9132///     _MM_MANT_SIGN_zero    // sign = 0
9133///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9134///
9135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
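///
/// # Examples
///
/// A minimal usage sketch (illustrative only, not taken from Intel's documentation). The constant
/// names are assumed to be the `_MM_MANTISSA_NORM_ENUM`/`_MM_MANTISSA_SIGN_ENUM` values exposed
/// by this crate, and the required target features are assumed to be available:
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn normalized_mantissas(a: __m128h) -> __m128h {
///     // Normalize each mantissa into [1, 2) and keep the source sign,
///     // e.g. 12.0 -> 1.5 and -0.25 -> -1.0.
///     _mm_getmant_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a)
/// }
/// ```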
9136#[inline]
9137#[target_feature(enable = "avx512fp16,avx512vl")]
9138#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9139#[rustc_legacy_const_generics(1, 2)]
9140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9141pub fn _mm_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9142    a: __m128h,
9143) -> __m128h {
9144    static_assert_uimm_bits!(NORM, 4);
9145    static_assert_uimm_bits!(SIGN, 2);
9146    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
9147}
9148
9149/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9150/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9151/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9152/// by norm and the sign depends on sign and the source sign.
9153///
9154/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9155///
9156///     _MM_MANT_NORM_1_2     // interval [1, 2)
9157///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9158///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9159///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9160///
9161/// The sign is determined by sc which can take the following values:
9162///
9163///     _MM_MANT_SIGN_src     // sign = sign(src)
9164///     _MM_MANT_SIGN_zero    // sign = 0
9165///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9166///
9167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
9168#[inline]
9169#[target_feature(enable = "avx512fp16,avx512vl")]
9170#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9171#[rustc_legacy_const_generics(3, 4)]
9172#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9173pub fn _mm_mask_getmant_ph<
9174    const NORM: _MM_MANTISSA_NORM_ENUM,
9175    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9176>(
9177    src: __m128h,
9178    k: __mmask8,
9179    a: __m128h,
9180) -> __m128h {
9181    unsafe {
9182        static_assert_uimm_bits!(NORM, 4);
9183        static_assert_uimm_bits!(SIGN, 2);
9184        vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
9185    }
9186}
9187
9188/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9189/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9190/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9191/// by norm and the sign depends on sign and the source sign.
9192///
9193/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9194///
9195///     _MM_MANT_NORM_1_2     // interval [1, 2)
9196///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9197///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9198///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9199///
9200/// The sign is determined by sc which can take the following values:
9201///
9202///     _MM_MANT_SIGN_src     // sign = sign(src)
9203///     _MM_MANT_SIGN_zero    // sign = 0
9204///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9205///
9206/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
9207#[inline]
9208#[target_feature(enable = "avx512fp16,avx512vl")]
9209#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9210#[rustc_legacy_const_generics(2, 3)]
9211#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9212pub fn _mm_maskz_getmant_ph<
9213    const NORM: _MM_MANTISSA_NORM_ENUM,
9214    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9215>(
9216    k: __mmask8,
9217    a: __m128h,
9218) -> __m128h {
9219    static_assert_uimm_bits!(NORM, 4);
9220    static_assert_uimm_bits!(SIGN, 2);
9221    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
9222}
9223
9224/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9225/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9226/// on the interval range defined by norm and the sign depends on sign and the source sign.
9227///
9228/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9229///
9230///     _MM_MANT_NORM_1_2     // interval [1, 2)
9231///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9232///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9233///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9234///
9235/// The sign is determined by sc which can take the following values:
9236///
9237///     _MM_MANT_SIGN_src     // sign = sign(src)
9238///     _MM_MANT_SIGN_zero    // sign = 0
9239///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9240///
9241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
9242#[inline]
9243#[target_feature(enable = "avx512fp16,avx512vl")]
9244#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9245#[rustc_legacy_const_generics(1, 2)]
9246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9247pub fn _mm256_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9248    a: __m256h,
9249) -> __m256h {
9250    static_assert_uimm_bits!(NORM, 4);
9251    static_assert_uimm_bits!(SIGN, 2);
9252    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
9253}
9254
9255/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9256/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9257/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9258/// by norm and the sign depends on sign and the source sign.
9259///
9260/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9261///
9262///     _MM_MANT_NORM_1_2     // interval [1, 2)
9263///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9264///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9265///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9266///
9267/// The sign is determined by sc which can take the following values:
9268///
9269///     _MM_MANT_SIGN_src     // sign = sign(src)
9270///     _MM_MANT_SIGN_zero    // sign = 0
9271///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9272///
9273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
9274#[inline]
9275#[target_feature(enable = "avx512fp16,avx512vl")]
9276#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9277#[rustc_legacy_const_generics(3, 4)]
9278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9279pub fn _mm256_mask_getmant_ph<
9280    const NORM: _MM_MANTISSA_NORM_ENUM,
9281    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9282>(
9283    src: __m256h,
9284    k: __mmask16,
9285    a: __m256h,
9286) -> __m256h {
9287    unsafe {
9288        static_assert_uimm_bits!(NORM, 4);
9289        static_assert_uimm_bits!(SIGN, 2);
9290        vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
9291    }
9292}
9293
9294/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9295/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9296/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9297/// by norm and the sign depends on sign and the source sign.
9298///
9299/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9300///
9301///     _MM_MANT_NORM_1_2     // interval [1, 2)
9302///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9303///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9304///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9305///
9306/// The sign is determined by sc which can take the following values:
9307///
9308///     _MM_MANT_SIGN_src     // sign = sign(src)
9309///     _MM_MANT_SIGN_zero    // sign = 0
9310///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9311///
9312/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
9313#[inline]
9314#[target_feature(enable = "avx512fp16,avx512vl")]
9315#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9316#[rustc_legacy_const_generics(2, 3)]
9317#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9318pub fn _mm256_maskz_getmant_ph<
9319    const NORM: _MM_MANTISSA_NORM_ENUM,
9320    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9321>(
9322    k: __mmask16,
9323    a: __m256h,
9324) -> __m256h {
9325    static_assert_uimm_bits!(NORM, 4);
9326    static_assert_uimm_bits!(SIGN, 2);
9327    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
9328}
9329
9330/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9331/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9332/// on the interval range defined by norm and the sign depends on sign and the source sign.
9333///
9334/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9335///
9336///     _MM_MANT_NORM_1_2     // interval [1, 2)
9337///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9338///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9339///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9340///
9341/// The sign is determined by sc which can take the following values:
9342///
9343///     _MM_MANT_SIGN_src     // sign = sign(src)
9344///     _MM_MANT_SIGN_zero    // sign = 0
9345///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9346///
9347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
9348#[inline]
9349#[target_feature(enable = "avx512fp16")]
9350#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9351#[rustc_legacy_const_generics(1, 2)]
9352#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9353pub fn _mm512_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9354    a: __m512h,
9355) -> __m512h {
9356    static_assert_uimm_bits!(NORM, 4);
9357    static_assert_uimm_bits!(SIGN, 2);
9358    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
9359}
9360
9361/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9362/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9363/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9364/// by norm and the sign depends on sign and the source sign.
9365///
9366/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9367///
9368///     _MM_MANT_NORM_1_2     // interval [1, 2)
9369///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9370///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9371///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9372///
9373/// The sign is determined by sc which can take the following values:
9374///
9375///     _MM_MANT_SIGN_src     // sign = sign(src)
9376///     _MM_MANT_SIGN_zero    // sign = 0
9377///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9378///
9379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
9380#[inline]
9381#[target_feature(enable = "avx512fp16")]
9382#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9383#[rustc_legacy_const_generics(3, 4)]
9384#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9385pub fn _mm512_mask_getmant_ph<
9386    const NORM: _MM_MANTISSA_NORM_ENUM,
9387    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9388>(
9389    src: __m512h,
9390    k: __mmask32,
9391    a: __m512h,
9392) -> __m512h {
9393    static_assert_uimm_bits!(NORM, 4);
9394    static_assert_uimm_bits!(SIGN, 2);
9395    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9396}
9397
9398/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9399/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9400/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9401/// by norm and the sign depends on sign and the source sign.
9402///
9403/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9404///
9405///     _MM_MANT_NORM_1_2     // interval [1, 2)
9406///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9407///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9408///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9409///
9410/// The sign is determined by sc which can take the following values:
9411///
9412///     _MM_MANT_SIGN_src     // sign = sign(src)
9413///     _MM_MANT_SIGN_zero    // sign = 0
9414///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9415///
9416/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
9417#[inline]
9418#[target_feature(enable = "avx512fp16")]
9419#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9420#[rustc_legacy_const_generics(2, 3)]
9421#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9422pub fn _mm512_maskz_getmant_ph<
9423    const NORM: _MM_MANTISSA_NORM_ENUM,
9424    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9425>(
9426    k: __mmask32,
9427    a: __m512h,
9428) -> __m512h {
9429    static_assert_uimm_bits!(NORM, 4);
9430    static_assert_uimm_bits!(SIGN, 2);
9431    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
9432}
9433
9434/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9435/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9436/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9437/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9438///
9439/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9440///
9441///     _MM_MANT_NORM_1_2     // interval [1, 2)
9442///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9443///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9444///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9445///
9446/// The sign is determined by sc which can take the following values:
9447///
9448///     _MM_MANT_SIGN_src     // sign = sign(src)
9449///     _MM_MANT_SIGN_zero    // sign = 0
9450///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9451///
9454/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
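///
/// # Examples
///
/// A minimal usage sketch (illustrative only, not taken from Intel's documentation) showing all
/// three const parameters together; the constant names are assumed to be the
/// `_MM_MANTISSA_NORM_ENUM`/`_MM_MANTISSA_SIGN_ENUM` values exposed by this crate, and the
/// required target feature is assumed to be available:
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16")]
/// fn quiet_mantissas(a: __m512h) -> __m512h {
///     // Normalize into [0.5, 1), force a non-negative sign, and suppress exceptions.
///     _mm512_getmant_round_ph::<_MM_MANT_NORM_P5_1, _MM_MANT_SIGN_ZERO, _MM_FROUND_NO_EXC>(a)
/// }
/// ```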
9455#[inline]
9456#[target_feature(enable = "avx512fp16")]
9457#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9458#[rustc_legacy_const_generics(1, 2, 3)]
9459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9460pub fn _mm512_getmant_round_ph<
9461    const NORM: _MM_MANTISSA_NORM_ENUM,
9462    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9463    const SAE: i32,
9464>(
9465    a: __m512h,
9466) -> __m512h {
9467    static_assert_uimm_bits!(NORM, 4);
9468    static_assert_uimm_bits!(SIGN, 2);
9469    static_assert_sae!(SAE);
9470    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9471}
9472
9473/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9474/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9475/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9476/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9477/// in the sae parameter.
9478///
9479/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9480///
9481///     _MM_MANT_NORM_1_2     // interval [1, 2)
9482///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9483///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9484///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9485///
9486/// The sign is determined by sc which can take the following values:
9487///
9488///     _MM_MANT_SIGN_src     // sign = sign(src)
9489///     _MM_MANT_SIGN_zero    // sign = 0
9490///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9491///
9494/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
9495#[inline]
9496#[target_feature(enable = "avx512fp16")]
9497#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9498#[rustc_legacy_const_generics(3, 4, 5)]
9499#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9500pub fn _mm512_mask_getmant_round_ph<
9501    const NORM: _MM_MANTISSA_NORM_ENUM,
9502    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9503    const SAE: i32,
9504>(
9505    src: __m512h,
9506    k: __mmask32,
9507    a: __m512h,
9508) -> __m512h {
9509    unsafe {
9510        static_assert_uimm_bits!(NORM, 4);
9511        static_assert_uimm_bits!(SIGN, 2);
9512        static_assert_sae!(SAE);
9513        vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
9514    }
9515}
9516
9517/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9518/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9519/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9520/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9521/// in the sae parameter
9522///
9523/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9524///
9525///     _MM_MANT_NORM_1_2     // interval [1, 2)
9526///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9527///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9528///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9529///
9530/// The sign is determined by sc which can take the following values:
9531///
9532///     _MM_MANT_SIGN_src     // sign = sign(src)
9533///     _MM_MANT_SIGN_zero    // sign = 0
9534///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9535///
9538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
9539#[inline]
9540#[target_feature(enable = "avx512fp16")]
9541#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9542#[rustc_legacy_const_generics(2, 3, 4)]
9543#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9544pub fn _mm512_maskz_getmant_round_ph<
9545    const NORM: _MM_MANTISSA_NORM_ENUM,
9546    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9547    const SAE: i32,
9548>(
9549    k: __mmask32,
9550    a: __m512h,
9551) -> __m512h {
9552    static_assert_uimm_bits!(NORM, 4);
9553    static_assert_uimm_bits!(SIGN, 2);
9554    static_assert_sae!(SAE);
9555    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
9556}
9557
9558/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9559/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9560/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9561/// on the interval range defined by norm and the sign depends on sign and the source sign.
9562///
9563/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9564///
9565///     _MM_MANT_NORM_1_2     // interval [1, 2)
9566///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9567///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9568///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9569///
9570/// The sign is determined by sc which can take the following values:
9571///
9572///     _MM_MANT_SIGN_src     // sign = sign(src)
9573///     _MM_MANT_SIGN_zero    // sign = 0
9574///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9575///
9576/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
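///
/// A minimal usage sketch (marked `ignore` because it needs a CPU with `avx512fp16` and the
/// unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm_set1_ph(1.0);
///     let b = _mm_set1_ph(24.0);
///     // NORM = 0 ([1, 2) interval) and SIGN = 0 (source sign), per the tables above.
///     let r = _mm_getmant_sh::<0, 0>(a, b);
///     // Lower lane: 1.5 (24.0 = 1.5 * 2^4); the upper 7 lanes are copied from `a`.
/// }
/// ```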
9577#[inline]
9578#[target_feature(enable = "avx512fp16")]
9579#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9580#[rustc_legacy_const_generics(2, 3)]
9581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9582pub fn _mm_getmant_sh<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9583    a: __m128h,
9584    b: __m128h,
9585) -> __m128h {
9586    static_assert_uimm_bits!(NORM, 4);
9587    static_assert_uimm_bits!(SIGN, 2);
9588    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9589}
9590
9591/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9592/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9593/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9594/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9595/// the source sign.
9596///
9597/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9598///
9599///     _MM_MANT_NORM_1_2     // interval [1, 2)
9600///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9601///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9602///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9603///
9604/// The sign is determined by sc which can take the following values:
9605///
9606///     _MM_MANT_SIGN_src     // sign = sign(src)
9607///     _MM_MANT_SIGN_zero    // sign = 0
9608///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9609///
9610/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
9611#[inline]
9612#[target_feature(enable = "avx512fp16")]
9613#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9614#[rustc_legacy_const_generics(4, 5)]
9615#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9616pub fn _mm_mask_getmant_sh<
9617    const NORM: _MM_MANTISSA_NORM_ENUM,
9618    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9619>(
9620    src: __m128h,
9621    k: __mmask8,
9622    a: __m128h,
9623    b: __m128h,
9624) -> __m128h {
9625    static_assert_uimm_bits!(NORM, 4);
9626    static_assert_uimm_bits!(SIGN, 2);
9627    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9628}
9629
9630/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9631/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9632/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9633/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9634/// the source sign.
9635///
9636/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9637///
9638///     _MM_MANT_NORM_1_2     // interval [1, 2)
9639///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9640///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9641///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9642///
9643/// The sign is determined by sc which can take the following values:
9644///
9645///     _MM_MANT_SIGN_src     // sign = sign(src)
9646///     _MM_MANT_SIGN_zero    // sign = 0
9647///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9648///
9649/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
9650#[inline]
9651#[target_feature(enable = "avx512fp16")]
9652#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9653#[rustc_legacy_const_generics(3, 4)]
9654#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9655pub fn _mm_maskz_getmant_sh<
9656    const NORM: _MM_MANTISSA_NORM_ENUM,
9657    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9658>(
9659    k: __mmask8,
9660    a: __m128h,
9661    b: __m128h,
9662) -> __m128h {
9663    static_assert_uimm_bits!(NORM, 4);
9664    static_assert_uimm_bits!(SIGN, 2);
9665    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), k, a, b)
9666}
9667
9668/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9669/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9670/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9671/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9672/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9673///
9674/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9675///
9676///     _MM_MANT_NORM_1_2     // interval [1, 2)
9677///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9678///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9679///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9680///
9681/// The sign is determined by sc which can take the following values:
9682///
9683///     _MM_MANT_SIGN_src     // sign = sign(src)
9684///     _MM_MANT_SIGN_zero    // sign = 0
9685///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9686///
9689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
9690#[inline]
9691#[target_feature(enable = "avx512fp16")]
9692#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9693#[rustc_legacy_const_generics(2, 3, 4)]
9694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9695pub fn _mm_getmant_round_sh<
9696    const NORM: _MM_MANTISSA_NORM_ENUM,
9697    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9698    const SAE: i32,
9699>(
9700    a: __m128h,
9701    b: __m128h,
9702) -> __m128h {
9703    static_assert_uimm_bits!(NORM, 4);
9704    static_assert_uimm_bits!(SIGN, 2);
9705    static_assert_sae!(SAE);
9706    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9707}
9708
9709/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9710/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9711/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9712/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9713/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9714///
9715/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9716///
9717///     _MM_MANT_NORM_1_2     // interval [1, 2)
9718///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9719///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9720///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9721///
9722/// The sign is determined by sc which can take the following values:
9723///
9724///     _MM_MANT_SIGN_src     // sign = sign(src)
9725///     _MM_MANT_SIGN_zero    // sign = 0
9726///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9727///
9730/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
9731#[inline]
9732#[target_feature(enable = "avx512fp16")]
9733#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9734#[rustc_legacy_const_generics(4, 5, 6)]
9735#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9736pub fn _mm_mask_getmant_round_sh<
9737    const NORM: _MM_MANTISSA_NORM_ENUM,
9738    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9739    const SAE: i32,
9740>(
9741    src: __m128h,
9742    k: __mmask8,
9743    a: __m128h,
9744    b: __m128h,
9745) -> __m128h {
9746    unsafe {
9747        static_assert_uimm_bits!(NORM, 4);
9748        static_assert_uimm_bits!(SIGN, 2);
9749        static_assert_sae!(SAE);
9750        vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
9751    }
9752}
9753
9754/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9755/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9756/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9757/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9758/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9759///
9760/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9761///
9762///     _MM_MANT_NORM_1_2     // interval [1, 2)
9763///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9764///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9765///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9766///
9767/// The sign is determined by sc which can take the following values:
9768///
9769///     _MM_MANT_SIGN_src     // sign = sign(src)
9770///     _MM_MANT_SIGN_zero    // sign = 0
9771///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9772///
9775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
9776#[inline]
9777#[target_feature(enable = "avx512fp16")]
9778#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9779#[rustc_legacy_const_generics(3, 4, 5)]
9780#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9781pub fn _mm_maskz_getmant_round_sh<
9782    const NORM: _MM_MANTISSA_NORM_ENUM,
9783    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9784    const SAE: i32,
9785>(
9786    k: __mmask8,
9787    a: __m128h,
9788    b: __m128h,
9789) -> __m128h {
9790    static_assert_uimm_bits!(NORM, 4);
9791    static_assert_uimm_bits!(SIGN, 2);
9792    static_assert_sae!(SAE);
9793    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9794}
9795
9796/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9797/// specified by imm8, and store the results in dst.
9798///
9799/// Rounding is done according to the imm8 parameter, which can be one of:
9800///
9801/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9802/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9803/// * [`_MM_FROUND_TO_POS_INF`] : round up
9804/// * [`_MM_FROUND_TO_ZERO`] : truncate
9805/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9806///
9807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
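///
/// A minimal usage sketch (marked `ignore` because it needs a CPU with `avx512fp16` and
/// `avx512vl` plus the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let a = _mm_set1_ph(2.75);
///     // Round every lane to the nearest integer.
///     let r = _mm_roundscale_ph::<_MM_FROUND_TO_NEAREST_INT>(a);
///     // Every lane of `r` is 3.0.
/// }
/// ```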
9808#[inline]
9809#[target_feature(enable = "avx512fp16,avx512vl")]
9810#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9811#[rustc_legacy_const_generics(1)]
9812#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9813pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
9814    static_assert_uimm_bits!(IMM8, 8);
9815    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
9816}
9817
9818/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9819/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9820/// the corresponding mask bit is not set).
9821///
9822/// Rounding is done according to the imm8 parameter, which can be one of:
9823///
9824/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9825/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9826/// * [`_MM_FROUND_TO_POS_INF`] : round up
9827/// * [`_MM_FROUND_TO_ZERO`] : truncate
9828/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9829///
9830/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
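///
/// A minimal sketch of the writemask behaviour (marked `ignore` because it needs a CPU with
/// `avx512fp16` and `avx512vl` plus the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let src = _mm_set1_ph(-1.0);
///     let a = _mm_set1_ph(2.25);
///     // Only lanes 0 and 1 are rounded (truncated); the other lanes are copied from `src`.
///     let r = _mm_mask_roundscale_ph::<_MM_FROUND_TO_ZERO>(src, 0b0000_0011, a);
///     // r = [2.0, 2.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]
/// }
/// ```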
9831#[inline]
9832#[target_feature(enable = "avx512fp16,avx512vl")]
9833#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9834#[rustc_legacy_const_generics(3)]
9835#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9836pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
9837    unsafe {
9838        static_assert_uimm_bits!(IMM8, 8);
9839        vrndscaleph_128(a, IMM8, src, k)
9840    }
9841}
9842
9843/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9844/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9845/// mask bit is not set).
9846///
9847/// Rounding is done according to the imm8 parameter, which can be one of:
9848///
9849/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9850/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9851/// * [`_MM_FROUND_TO_POS_INF`] : round up
9852/// * [`_MM_FROUND_TO_ZERO`] : truncate
9853/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9854///
9855/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
9856#[inline]
9857#[target_feature(enable = "avx512fp16,avx512vl")]
9858#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9859#[rustc_legacy_const_generics(2)]
9860#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9861pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
9862    static_assert_uimm_bits!(IMM8, 8);
9863    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
9864}
9865
9866/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9867/// specified by imm8, and store the results in dst.
9868///
9869/// Rounding is done according to the imm8 parameter, which can be one of:
9870///
9871/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9872/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9873/// * [`_MM_FROUND_TO_POS_INF`] : round up
9874/// * [`_MM_FROUND_TO_ZERO`] : truncate
9875/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9876///
9877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
9878#[inline]
9879#[target_feature(enable = "avx512fp16,avx512vl")]
9880#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9881#[rustc_legacy_const_generics(1)]
9882#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9883pub fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
9884    static_assert_uimm_bits!(IMM8, 8);
9885    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
9886}
9887
9888/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9889/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9890/// the corresponding mask bit is not set).
9891///
9892/// Rounding is done according to the imm8 parameter, which can be one of:
9893///
9894/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9895/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9896/// * [`_MM_FROUND_TO_POS_INF`] : round up
9897/// * [`_MM_FROUND_TO_ZERO`] : truncate
9898/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9899///
9900/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
9901#[inline]
9902#[target_feature(enable = "avx512fp16,avx512vl")]
9903#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9904#[rustc_legacy_const_generics(3)]
9905#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9906pub fn _mm256_mask_roundscale_ph<const IMM8: i32>(
9907    src: __m256h,
9908    k: __mmask16,
9909    a: __m256h,
9910) -> __m256h {
9911    unsafe {
9912        static_assert_uimm_bits!(IMM8, 8);
9913        vrndscaleph_256(a, IMM8, src, k)
9914    }
9915}
9916
9917/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9918/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9919/// mask bit is not set).
9920///
9921/// Rounding is done according to the imm8 parameter, which can be one of:
9922///
9923/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9924/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9925/// * [`_MM_FROUND_TO_POS_INF`] : round up
9926/// * [`_MM_FROUND_TO_ZERO`] : truncate
9927/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9928///
9929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
9930#[inline]
9931#[target_feature(enable = "avx512fp16,avx512vl")]
9932#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9933#[rustc_legacy_const_generics(2)]
9934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9935pub fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
9936    static_assert_uimm_bits!(IMM8, 8);
9937    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
9938}
9939
9940/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9941/// specified by imm8, and store the results in dst.
9942///
9943/// Rounding is done according to the imm8 parameter, which can be one of:
9944///
9945/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9946/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9947/// * [`_MM_FROUND_TO_POS_INF`] : round up
9948/// * [`_MM_FROUND_TO_ZERO`] : truncate
9949/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9950///
9951/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
9952#[inline]
9953#[target_feature(enable = "avx512fp16")]
9954#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9955#[rustc_legacy_const_generics(1)]
9956#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9957pub fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
9958    static_assert_uimm_bits!(IMM8, 8);
9959    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
9960}
9961
9962/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9963/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9964/// the corresponding mask bit is not set).
9965///
9966/// Rounding is done according to the imm8 parameter, which can be one of:
9967///
9968/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9969/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9970/// * [`_MM_FROUND_TO_POS_INF`] : round up
9971/// * [`_MM_FROUND_TO_ZERO`] : truncate
9972/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9973///
9974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
9975#[inline]
9976#[target_feature(enable = "avx512fp16")]
9977#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9978#[rustc_legacy_const_generics(3)]
9979#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9980pub fn _mm512_mask_roundscale_ph<const IMM8: i32>(
9981    src: __m512h,
9982    k: __mmask32,
9983    a: __m512h,
9984) -> __m512h {
9985    static_assert_uimm_bits!(IMM8, 8);
9986    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9987}
9988
9989/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9990/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9991/// mask bit is not set).
9992///
9993/// Rounding is done according to the imm8 parameter, which can be one of:
9994///
9995/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9996/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9997/// * [`_MM_FROUND_TO_POS_INF`] : round up
9998/// * [`_MM_FROUND_TO_ZERO`] : truncate
9999/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10000///
10001/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
10002#[inline]
10003#[target_feature(enable = "avx512fp16")]
10004#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
10005#[rustc_legacy_const_generics(2)]
10006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10007pub fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10008    static_assert_uimm_bits!(IMM8, 8);
10009    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10010}
10011
10012/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10013/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10014/// in the sae parameter
10015///
10016/// Rounding is done according to the imm8 parameter, which can be one of:
10017///
10018/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10019/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10020/// * [`_MM_FROUND_TO_POS_INF`] : round up
10021/// * [`_MM_FROUND_TO_ZERO`] : truncate
10022/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10023///
10024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
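///
/// A minimal usage sketch (marked `ignore` because it needs a CPU with `avx512fp16` and the
/// unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm512_set1_ph(2.5);
///     // Round every lane down to the nearest integer, suppressing exceptions via SAE.
///     let r = _mm512_roundscale_round_ph::<_MM_FROUND_TO_NEG_INF, _MM_FROUND_NO_EXC>(a);
///     // Every lane of `r` is 2.0.
/// }
/// ```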
10025#[inline]
10026#[target_feature(enable = "avx512fp16")]
10027#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10028#[rustc_legacy_const_generics(1, 2)]
10029#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10030pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10031    static_assert_uimm_bits!(IMM8, 8);
10032    static_assert_sae!(SAE);
10033    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10034}
10035
10036/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10037/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10038/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10039/// in the sae parameter
10040///
10041/// Rounding is done according to the imm8 parameter, which can be one of:
10042///
10043/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10044/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10045/// * [`_MM_FROUND_TO_POS_INF`] : round up
10046/// * [`_MM_FROUND_TO_ZERO`] : truncate
10047/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10048///
10049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
10050#[inline]
10051#[target_feature(enable = "avx512fp16")]
10052#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10053#[rustc_legacy_const_generics(3, 4)]
10054#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10055pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10056    src: __m512h,
10057    k: __mmask32,
10058    a: __m512h,
10059) -> __m512h {
10060    unsafe {
10061        static_assert_uimm_bits!(IMM8, 8);
10062        static_assert_sae!(SAE);
10063        vrndscaleph_512(a, IMM8, src, k, SAE)
10064    }
10065}
10066
10067/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10068/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10069/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10070///
10071/// Rounding is done according to the imm8 parameter, which can be one of:
10072///
10073/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10074/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10075/// * [`_MM_FROUND_TO_POS_INF`] : round up
10076/// * [`_MM_FROUND_TO_ZERO`] : truncate
10077/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10078///
10079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
10080#[inline]
10081#[target_feature(enable = "avx512fp16")]
10082#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10083#[rustc_legacy_const_generics(2, 3)]
10084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10085pub fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10086    k: __mmask32,
10087    a: __m512h,
10088) -> __m512h {
10089    static_assert_uimm_bits!(IMM8, 8);
10090    static_assert_sae!(SAE);
10091    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10092}
10093
10094/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10095/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10096/// from a to the upper elements of dst.
10097///
10098/// Rounding is done according to the imm8 parameter, which can be one of:
10099///
10100/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10101/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10102/// * [`_MM_FROUND_TO_POS_INF`] : round up
10103/// * [`_MM_FROUND_TO_ZERO`] : truncate
10104/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10105///
10106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
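///
/// A minimal usage sketch (marked `ignore` because it needs a CPU with `avx512fp16` and the
/// unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm_set1_ph(8.0);
///     let b = _mm_set1_ph(0.75);
///     // Lower lane: 0.75 rounded up to 1.0; the upper 7 lanes are copied from `a`.
///     let r = _mm_roundscale_sh::<_MM_FROUND_TO_POS_INF>(a, b);
/// }
/// ```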
10107#[inline]
10108#[target_feature(enable = "avx512fp16")]
10109#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10110#[rustc_legacy_const_generics(2)]
10111#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10112pub fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10113    static_assert_uimm_bits!(IMM8, 8);
10114    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10115}
10116
10117/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10118/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10119/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10120///
10121/// Rounding is done according to the imm8 parameter, which can be one of:
10122///
10123/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10124/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10125/// * [`_MM_FROUND_TO_POS_INF`] : round up
10126/// * [`_MM_FROUND_TO_ZERO`] : truncate
10127/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10128///
10129/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
10130#[inline]
10131#[target_feature(enable = "avx512fp16")]
10132#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10133#[rustc_legacy_const_generics(4)]
10134#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10135pub fn _mm_mask_roundscale_sh<const IMM8: i32>(
10136    src: __m128h,
10137    k: __mmask8,
10138    a: __m128h,
10139    b: __m128h,
10140) -> __m128h {
10141    static_assert_uimm_bits!(IMM8, 8);
10142    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10143}
10144
10145/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10146/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10147/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10148///
10149/// Rounding is done according to the imm8 parameter, which can be one of:
10150///
10151/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10152/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10153/// * [`_MM_FROUND_TO_POS_INF`] : round up
10154/// * [`_MM_FROUND_TO_ZERO`] : truncate
10155/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10156///
10157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
10158#[inline]
10159#[target_feature(enable = "avx512fp16")]
10160#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10161#[rustc_legacy_const_generics(3)]
10162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10163pub fn _mm_maskz_roundscale_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10164    static_assert_uimm_bits!(IMM8, 8);
10165    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
10166}
10167
10168/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10169/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10170/// from a to the upper elements of dst.
10171///
10172/// Rounding is done according to the imm8 parameter, which can be one of:
10173///
10174/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10175/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10176/// * [`_MM_FROUND_TO_POS_INF`] : round up
10177/// * [`_MM_FROUND_TO_ZERO`] : truncate
10178/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10179///
10180/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10181///
10182/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
10183#[inline]
10184#[target_feature(enable = "avx512fp16")]
10185#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10186#[rustc_legacy_const_generics(2, 3)]
10187#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10188pub fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10189    static_assert_uimm_bits!(IMM8, 8);
10190    static_assert_sae!(SAE);
10191    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10192}
10193
10194/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10195/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10196/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10197///
10198/// Rounding is done according to the imm8 parameter, which can be one of:
10199///
10200/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10201/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10202/// * [`_MM_FROUND_TO_POS_INF`] : round up
10203/// * [`_MM_FROUND_TO_ZERO`] : truncate
10204/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10205///
10206/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10207///
10208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
10209#[inline]
10210#[target_feature(enable = "avx512fp16")]
10211#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10212#[rustc_legacy_const_generics(4, 5)]
10213#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10214pub fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10215    src: __m128h,
10216    k: __mmask8,
10217    a: __m128h,
10218    b: __m128h,
10219) -> __m128h {
10220    unsafe {
10221        static_assert_uimm_bits!(IMM8, 8);
10222        static_assert_sae!(SAE);
10223        vrndscalesh(a, b, src, k, IMM8, SAE)
10224    }
10225}
10226
10227/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10228/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10229/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10230///
10231/// Rounding is done according to the imm8 parameter, which can be one of:
10232///
10233/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10234/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10235/// * [`_MM_FROUND_TO_POS_INF`] : round up
10236/// * [`_MM_FROUND_TO_ZERO`] : truncate
10237/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10238///
10239/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10240///
10241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
10242#[inline]
10243#[target_feature(enable = "avx512fp16")]
10244#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10245#[rustc_legacy_const_generics(3, 4)]
10246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10247pub fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10248    k: __mmask8,
10249    a: __m128h,
10250    b: __m128h,
10251) -> __m128h {
10252    static_assert_uimm_bits!(IMM8, 8);
10253    static_assert_sae!(SAE);
10254    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
10255}
10256
10257/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10258/// the results in dst.
10259///
10260/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
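///
/// A minimal usage sketch (marked `ignore` because it needs a CPU with `avx512fp16` and
/// `avx512vl` plus the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let a = _mm_set1_ph(3.0);
///     let b = _mm_set1_ph(2.0);
///     // Each lane becomes a * 2^floor(b), i.e. 3.0 * 2^2 = 12.0.
///     let r = _mm_scalef_ph(a, b);
/// }
/// ```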
10261#[inline]
10262#[target_feature(enable = "avx512fp16,avx512vl")]
10263#[cfg_attr(test, assert_instr(vscalefph))]
10264#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10265pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
10266    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
10267}
10268
10269/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10270/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10271///
10272/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
10273#[inline]
10274#[target_feature(enable = "avx512fp16,avx512vl")]
10275#[cfg_attr(test, assert_instr(vscalefph))]
10276#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10277pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10278    unsafe { vscalefph_128(a, b, src, k) }
10279}
10280
10281/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10282/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10283///
10284/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
10285#[inline]
10286#[target_feature(enable = "avx512fp16,avx512vl")]
10287#[cfg_attr(test, assert_instr(vscalefph))]
10288#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10289pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10290    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
10291}
10292
10293/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10294/// the results in dst.
10295///
10296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
10297#[inline]
10298#[target_feature(enable = "avx512fp16,avx512vl")]
10299#[cfg_attr(test, assert_instr(vscalefph))]
10300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10301pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
10302    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
10303}
10304
10305/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10306/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10307///
10308/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
10309#[inline]
10310#[target_feature(enable = "avx512fp16,avx512vl")]
10311#[cfg_attr(test, assert_instr(vscalefph))]
10312#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10313pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10314    unsafe { vscalefph_256(a, b, src, k) }
10315}
10316
10317/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10318/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10319///
10320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
10321#[inline]
10322#[target_feature(enable = "avx512fp16,avx512vl")]
10323#[cfg_attr(test, assert_instr(vscalefph))]
10324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10325pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10326    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
10327}
10328
10329/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10330/// the results in dst.
10331///
10332/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
10333#[inline]
10334#[target_feature(enable = "avx512fp16")]
10335#[cfg_attr(test, assert_instr(vscalefph))]
10336#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10337pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
10338    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
10339}
10340
10341/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10342/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10343///
10344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
10345#[inline]
10346#[target_feature(enable = "avx512fp16")]
10347#[cfg_attr(test, assert_instr(vscalefph))]
10348#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10349pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10350    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10351}
10352
10353/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10354/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10355///
10356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
10357#[inline]
10358#[target_feature(enable = "avx512fp16")]
10359#[cfg_attr(test, assert_instr(vscalefph))]
10360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10361pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10362    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
10363}
10364
10365/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10366/// the results in dst.
10367///
10368/// Rounding is done according to the rounding parameter, which can be one of:
10369///
10370/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10371/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10372/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10373/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10374/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10375///
10376/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
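///
/// A minimal usage sketch (marked `ignore` because it needs a CPU with `avx512fp16` and the
/// unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm512_set1_ph(1.0);
///     let b = _mm512_set1_ph(-3.0);
///     // Each lane becomes 1.0 * 2^-3 = 0.125, rounding to nearest with exceptions suppressed.
///     let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
/// }
/// ```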
10377#[inline]
10378#[target_feature(enable = "avx512fp16")]
10379#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10380#[rustc_legacy_const_generics(2)]
10381#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10382pub fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
10383    static_assert_rounding!(ROUNDING);
10384    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
10385}
10386
10387/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10388/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10389///
10390/// Rounding is done according to the rounding parameter, which can be one of:
10391///
10392/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10393/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10394/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10395/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10396/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10397///
10398/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
10399#[inline]
10400#[target_feature(enable = "avx512fp16")]
10401#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10402#[rustc_legacy_const_generics(4)]
10403#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10404pub fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
10405    src: __m512h,
10406    k: __mmask32,
10407    a: __m512h,
10408    b: __m512h,
10409) -> __m512h {
10410    unsafe {
10411        static_assert_rounding!(ROUNDING);
10412        vscalefph_512(a, b, src, k, ROUNDING)
10413    }
10414}
10415
10416/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10417/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10418///
10419/// Rounding is done according to the rounding parameter, which can be one of:
10420///
10421/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10422/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10423/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10424/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10425/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10426///
10427/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
10428#[inline]
10429#[target_feature(enable = "avx512fp16")]
10430#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10431#[rustc_legacy_const_generics(3)]
10432#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10433pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
10434    k: __mmask32,
10435    a: __m512h,
10436    b: __m512h,
10437) -> __m512h {
10438    static_assert_rounding!(ROUNDING);
10439    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
10440}
10441
10442/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10443/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10444/// elements of dst.
10445///
10446/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
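///
/// A minimal usage sketch (marked `ignore` because it needs a CPU with `avx512fp16` and the
/// unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm_set1_ph(5.0);
///     let b = _mm_set1_ph(1.0);
///     // Lower lane: 5.0 * 2^1 = 10.0; the upper 7 lanes are copied from `a` and stay 5.0.
///     let r = _mm_scalef_sh(a, b);
/// }
/// ```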
10447#[inline]
10448#[target_feature(enable = "avx512fp16")]
10449#[cfg_attr(test, assert_instr(vscalefsh))]
10450#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10451pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
10452    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
10453}
10454
10455/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10456/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10457/// and copy the upper 7 packed elements from a to the upper elements of dst.
10458///
10459/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
10460#[inline]
10461#[target_feature(enable = "avx512fp16")]
10462#[cfg_attr(test, assert_instr(vscalefsh))]
10463#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10464pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10465    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10466}
10467
10468/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10469/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10470/// and copy the upper 7 packed elements from a to the upper elements of dst.
10471///
10472/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
10473#[inline]
10474#[target_feature(enable = "avx512fp16")]
10475#[cfg_attr(test, assert_instr(vscalefsh))]
10476#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10477pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10478    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b)
10479}
10480
10481/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10482/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10483/// elements of dst.
10484///
10485/// Rounding is done according to the rounding parameter, which can be one of:
10486///
10487/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10488/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10489/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10490/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10491/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10492///
10493/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
10494#[inline]
10495#[target_feature(enable = "avx512fp16")]
10496#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10497#[rustc_legacy_const_generics(2)]
10498#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10499pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
10500    static_assert_rounding!(ROUNDING);
10501    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10502}
10503
10504/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10505/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10506/// and copy the upper 7 packed elements from a to the upper elements of dst.
10507///
10508/// Rounding is done according to the rounding parameter, which can be one of:
10509///
10510/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10511/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10512/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10513/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10514/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10515///
10516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
10517#[inline]
10518#[target_feature(enable = "avx512fp16")]
10519#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10520#[rustc_legacy_const_generics(4)]
10521#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10522pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
10523    src: __m128h,
10524    k: __mmask8,
10525    a: __m128h,
10526    b: __m128h,
10527) -> __m128h {
10528    unsafe {
10529        static_assert_rounding!(ROUNDING);
10530        vscalefsh(a, b, src, k, ROUNDING)
10531    }
10532}
10533
10534/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10535/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10536/// and copy the upper 7 packed elements from a to the upper elements of dst.
10537///
10538/// Rounding is done according to the rounding parameter, which can be one of:
10539///
10540/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10541/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10542/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10543/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10544/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10545///
10546/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
10547#[inline]
10548#[target_feature(enable = "avx512fp16")]
10549#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10550#[rustc_legacy_const_generics(3)]
10551#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10552pub fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
10553    k: __mmask8,
10554    a: __m128h,
10555    b: __m128h,
10556) -> __m128h {
10557    static_assert_rounding!(ROUNDING);
10558    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
10559}
10560
10561/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10562/// number of bits specified by imm8, and store the results in dst.
10563///
10564/// Rounding is done according to the imm8 parameter, which can be one of:
10565///
10566/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10567/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10568/// * [`_MM_FROUND_TO_POS_INF`] : round up
10569/// * [`_MM_FROUND_TO_ZERO`] : truncate
10570/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10571///
10572/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
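///
/// A minimal usage sketch (marked `ignore` because it needs a CPU with `avx512fp16` and
/// `avx512vl` plus the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let a = _mm_set1_ph(1.25);
///     // The reduced argument is `a` minus `a` rounded to the requested precision; with
///     // 0 fraction bits and truncation this leaves the fractional part.
///     let r = _mm_reduce_ph::<_MM_FROUND_TO_ZERO>(a);
///     // Every lane of `r` is 0.25.
/// }
/// ```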
10573#[inline]
10574#[target_feature(enable = "avx512fp16,avx512vl")]
10575#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10576#[rustc_legacy_const_generics(1)]
10577#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10578pub fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
10579    static_assert_uimm_bits!(IMM8, 8);
10580    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
10581}
10582
10583/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10584/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10585/// from src when the corresponding mask bit is not set).
10586///
10587/// Rounding is done according to the imm8 parameter, which can be one of:
10588///
10589/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10590/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10591/// * [`_MM_FROUND_TO_POS_INF`] : round up
10592/// * [`_MM_FROUND_TO_ZERO`] : truncate
10593/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10594///
10595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
10596#[inline]
10597#[target_feature(enable = "avx512fp16,avx512vl")]
10598#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10599#[rustc_legacy_const_generics(3)]
10600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10601pub fn _mm_mask_reduce_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
10602    unsafe {
10603        static_assert_uimm_bits!(IMM8, 8);
10604        vreduceph_128(a, IMM8, src, k)
10605    }
10606}
10607
10608/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10609/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10610/// out when the corresponding mask bit is not set).
10611///
10612/// Rounding is done according to the imm8 parameter, which can be one of:
10613///
10614/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10615/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10616/// * [`_MM_FROUND_TO_POS_INF`] : round up
10617/// * [`_MM_FROUND_TO_ZERO`] : truncate
10618/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10619///
10620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
10621#[inline]
10622#[target_feature(enable = "avx512fp16,avx512vl")]
10623#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10624#[rustc_legacy_const_generics(2)]
10625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10626pub fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
10627    static_assert_uimm_bits!(IMM8, 8);
10628    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
10629}
10630
10631/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10632/// number of bits specified by imm8, and store the results in dst.
10633///
10634/// Rounding is done according to the imm8 parameter, which can be one of:
10635///
10636/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10637/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10638/// * [`_MM_FROUND_TO_POS_INF`] : round up
10639/// * [`_MM_FROUND_TO_ZERO`] : truncate
10640/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10641///
10642/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
10643#[inline]
10644#[target_feature(enable = "avx512fp16,avx512vl")]
10645#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10646#[rustc_legacy_const_generics(1)]
10647#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10648pub fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
10649    static_assert_uimm_bits!(IMM8, 8);
10650    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
10651}
10652
10653/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10654/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10655/// from src when the corresponding mask bit is not set).
10656///
10657/// Rounding is done according to the imm8 parameter, which can be one of:
10658///
10659/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10660/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10661/// * [`_MM_FROUND_TO_POS_INF`] : round up
10662/// * [`_MM_FROUND_TO_ZERO`] : truncate
10663/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10664///
10665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
10666#[inline]
10667#[target_feature(enable = "avx512fp16,avx512vl")]
10668#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10669#[rustc_legacy_const_generics(3)]
10670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10671pub fn _mm256_mask_reduce_ph<const IMM8: i32>(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
10672    unsafe {
10673        static_assert_uimm_bits!(IMM8, 8);
10674        vreduceph_256(a, IMM8, src, k)
10675    }
10676}
10677
10678/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10679/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10680/// out when the corresponding mask bit is not set).
10681///
10682/// Rounding is done according to the imm8 parameter, which can be one of:
10683///
10684/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10685/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10686/// * [`_MM_FROUND_TO_POS_INF`] : round up
10687/// * [`_MM_FROUND_TO_ZERO`] : truncate
10688/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10689///
10690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
10691#[inline]
10692#[target_feature(enable = "avx512fp16,avx512vl")]
10693#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10694#[rustc_legacy_const_generics(2)]
10695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10696pub fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
10697    static_assert_uimm_bits!(IMM8, 8);
10698    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
10699}
10700
10701/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10702/// number of bits specified by imm8, and store the results in dst.
10703///
10704/// Rounding is done according to the imm8 parameter, which can be one of:
10705///
10706/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10707/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10708/// * [`_MM_FROUND_TO_POS_INF`] : round up
10709/// * [`_MM_FROUND_TO_ZERO`] : truncate
10710/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10711///
10712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
10713#[inline]
10714#[target_feature(enable = "avx512fp16")]
10715#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10716#[rustc_legacy_const_generics(1)]
10717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10718pub fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
10719    static_assert_uimm_bits!(IMM8, 8);
10720    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
10721}
10722
10723/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10724/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10725/// from src when the corresponding mask bit is not set).
10726///
10727/// Rounding is done according to the imm8 parameter, which can be one of:
10728///
10729/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10730/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10731/// * [`_MM_FROUND_TO_POS_INF`] : round up
10732/// * [`_MM_FROUND_TO_ZERO`] : truncate
10733/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10734///
10735/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
10736#[inline]
10737#[target_feature(enable = "avx512fp16")]
10738#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10739#[rustc_legacy_const_generics(3)]
10740#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10741pub fn _mm512_mask_reduce_ph<const IMM8: i32>(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
10742    static_assert_uimm_bits!(IMM8, 8);
10743    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
10744}
10745
10746/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10747/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10748/// out when the corresponding mask bit is not set).
10749///
10750/// Rounding is done according to the imm8 parameter, which can be one of:
10751///
10752/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10753/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10754/// * [`_MM_FROUND_TO_POS_INF`] : round up
10755/// * [`_MM_FROUND_TO_ZERO`] : truncate
10756/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10757///
10758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
10759#[inline]
10760#[target_feature(enable = "avx512fp16")]
10761#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10762#[rustc_legacy_const_generics(2)]
10763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10764pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10765    static_assert_uimm_bits!(IMM8, 8);
10766    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10767}
10768
10769/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10770/// number of bits specified by imm8, and store the results in dst.
10771///
10772/// Rounding is done according to the imm8 parameter, which can be one of:
10773///
10774/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10775/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10776/// * [`_MM_FROUND_TO_POS_INF`] : round up
10777/// * [`_MM_FROUND_TO_ZERO`] : truncate
10778/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10779///
10780/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10781///
10782/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
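///
/// A minimal usage sketch (illustrative; the helper name is not from this crate and
/// `avx512fp16` support is assumed), showing how the two const generics are passed:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn reduce_no_exc(a: __m512h) -> __m512h {
///     // Truncate toward zero while suppressing floating-point exceptions.
///     _mm512_reduce_round_ph::<_MM_FROUND_TO_ZERO, _MM_FROUND_NO_EXC>(a)
/// }
/// ```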
10783#[inline]
10784#[target_feature(enable = "avx512fp16")]
10785#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10786#[rustc_legacy_const_generics(1, 2)]
10787#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10788pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10789    static_assert_uimm_bits!(IMM8, 8);
10790    static_assert_sae!(SAE);
10791    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10792}
10793
10794/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10795/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10796/// from src when the corresponding mask bit is not set).
10797///
10798/// Rounding is done according to the imm8 parameter, which can be one of:
10799///
10800/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10801/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10802/// * [`_MM_FROUND_TO_POS_INF`] : round up
10803/// * [`_MM_FROUND_TO_ZERO`] : truncate
10804/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10805///
10806/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10807///
10808/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
10809#[inline]
10810#[target_feature(enable = "avx512fp16")]
10811#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10812#[rustc_legacy_const_generics(3, 4)]
10813#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10814pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10815    src: __m512h,
10816    k: __mmask32,
10817    a: __m512h,
10818) -> __m512h {
10819    unsafe {
10820        static_assert_uimm_bits!(IMM8, 8);
10821        static_assert_sae!(SAE);
10822        vreduceph_512(a, IMM8, src, k, SAE)
10823    }
10824}
10825
10826/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10827/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10828/// out when the corresponding mask bit is not set).
10829///
10830/// Rounding is done according to the imm8 parameter, which can be one of:
10831///
10832/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10833/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10834/// * [`_MM_FROUND_TO_POS_INF`] : round up
10835/// * [`_MM_FROUND_TO_ZERO`] : truncate
10836/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10837///
10838/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10839///
10840/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
10841#[inline]
10842#[target_feature(enable = "avx512fp16")]
10843#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10844#[rustc_legacy_const_generics(2, 3)]
10845#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10846pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10847    k: __mmask32,
10848    a: __m512h,
10849) -> __m512h {
10850    static_assert_uimm_bits!(IMM8, 8);
10851    static_assert_sae!(SAE);
10852    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10853}
10854
10855/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10856/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
10857/// upper 7 packed elements from a to the upper elements of dst.
10858///
10859/// Rounding is done according to the imm8 parameter, which can be one of:
10860///
10861/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10862/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10863/// * [`_MM_FROUND_TO_POS_INF`] : round up
10864/// * [`_MM_FROUND_TO_ZERO`] : truncate
10865/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10866///
10867/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
10868#[inline]
10869#[target_feature(enable = "avx512fp16")]
10870#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10871#[rustc_legacy_const_generics(2)]
10872#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10873pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10874    static_assert_uimm_bits!(IMM8, 8);
10875    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10876}
10877
10878/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10879/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10880/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
10881/// a to the upper elements of dst.
10882///
10883/// Rounding is done according to the imm8 parameter, which can be one of:
10884///
10885/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10886/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10887/// * [`_MM_FROUND_TO_POS_INF`] : round up
10888/// * [`_MM_FROUND_TO_ZERO`] : truncate
10889/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10890///
10891/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
10892#[inline]
10893#[target_feature(enable = "avx512fp16")]
10894#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10895#[rustc_legacy_const_generics(4)]
10896#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10897pub fn _mm_mask_reduce_sh<const IMM8: i32>(
10898    src: __m128h,
10899    k: __mmask8,
10900    a: __m128h,
10901    b: __m128h,
10902) -> __m128h {
10903    static_assert_uimm_bits!(IMM8, 8);
10904    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10905}
10906
10907/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10908/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10909/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10910/// to the upper elements of dst.
10911///
10912/// Rounding is done according to the imm8 parameter, which can be one of:
10913///
10914/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10915/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10916/// * [`_MM_FROUND_TO_POS_INF`] : round up
10917/// * [`_MM_FROUND_TO_ZERO`] : truncate
10918/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10919///
10920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
10921#[inline]
10922#[target_feature(enable = "avx512fp16")]
10923#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10924#[rustc_legacy_const_generics(3)]
10925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10926pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10927    static_assert_uimm_bits!(IMM8, 8);
10928    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
10929}
10930
10931/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10932/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
10933/// 7 packed elements from a to the upper elements of dst.
10934///
10935/// Rounding is done according to the imm8 parameter, which can be one of:
10936///
10937/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10938/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10939/// * [`_MM_FROUND_TO_POS_INF`] : round up
10940/// * [`_MM_FROUND_TO_ZERO`] : truncate
10941/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10942///
10943/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10944///
10945/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
10946#[inline]
10947#[target_feature(enable = "avx512fp16")]
10948#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10949#[rustc_legacy_const_generics(2, 3)]
10950#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10951pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10952    static_assert_uimm_bits!(IMM8, 8);
10953    static_assert_sae!(SAE);
10954    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10955}
10956
10957/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10958/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10959/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
10960/// to the upper elements of dst.
10961///
10962/// Rounding is done according to the imm8 parameter, which can be one of:
10963///
10964/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10965/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10966/// * [`_MM_FROUND_TO_POS_INF`] : round up
10967/// * [`_MM_FROUND_TO_ZERO`] : truncate
10968/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10969///
10970/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10971///
10972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
10973#[inline]
10974#[target_feature(enable = "avx512fp16")]
10975#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10976#[rustc_legacy_const_generics(4, 5)]
10977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10978pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
10979    src: __m128h,
10980    k: __mmask8,
10981    a: __m128h,
10982    b: __m128h,
10983) -> __m128h {
10984    unsafe {
10985        static_assert_uimm_bits!(IMM8, 8);
10986        static_assert_sae!(SAE);
10987        vreducesh(a, b, src, k, IMM8, SAE)
10988    }
10989}
10990
10991/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10992/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10993/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10994/// to the upper elements of dst.
10995///
10996/// Rounding is done according to the imm8 parameter, which can be one of:
10997///
10998/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10999/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11000/// * [`_MM_FROUND_TO_POS_INF`] : round up
11001/// * [`_MM_FROUND_TO_ZERO`] : truncate
11002/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11003///
11004/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
11005///
11006/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
11007#[inline]
11008#[target_feature(enable = "avx512fp16")]
11009#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
11010#[rustc_legacy_const_generics(3, 4)]
11011#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11012pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
11013    k: __mmask8,
11014    a: __m128h,
11015    b: __m128h,
11016) -> __m128h {
11017    static_assert_uimm_bits!(IMM8, 8);
11018    static_assert_sae!(SAE);
11019    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
11020}
11021
11022/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11023/// sum of all elements in a.
11024///
11025/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
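///
/// A minimal usage sketch (illustrative; the helper name is not from this crate and
/// `avx512fp16`/`avx512vl` support is assumed):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn sum_lanes() -> f16 {
///     let v = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
///     _mm_reduce_add_ph(v) // expected: 36.0
/// }
/// ```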
11026#[inline]
11027#[target_feature(enable = "avx512fp16,avx512vl")]
11028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11029pub fn _mm_reduce_add_ph(a: __m128h) -> f16 {
11030    unsafe {
11031        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11032        let a = _mm_add_ph(a, b);
11033        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11034        let a = _mm_add_ph(a, b);
11035        simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1)
11036    }
11037}
11038
11039/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11040/// sum of all elements in a.
11041///
11042/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
11043#[inline]
11044#[target_feature(enable = "avx512fp16,avx512vl")]
11045#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11046pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
11047    unsafe {
11048        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11049        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11050        _mm_reduce_add_ph(_mm_add_ph(p, q))
11051    }
11052}
11053
11054/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11055/// sum of all elements in a.
11056///
11057/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
11058#[inline]
11059#[target_feature(enable = "avx512fp16")]
11060#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11061pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
11062    unsafe {
11063        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11064        let q = simd_shuffle!(
11065            a,
11066            a,
11067            [
11068                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11069            ]
11070        );
11071        _mm256_reduce_add_ph(_mm256_add_ph(p, q))
11072    }
11073}
11074
11075/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11076/// the product of all elements in a.
11077///
11078/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
11079#[inline]
11080#[target_feature(enable = "avx512fp16,avx512vl")]
11081#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11082pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
11083    unsafe {
11084        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11085        let a = _mm_mul_ph(a, b);
11086        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11087        let a = _mm_mul_ph(a, b);
11088        simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1)
11089    }
11090}
11091
11092/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11093/// the product of all elements in a.
11094///
11095/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
11096#[inline]
11097#[target_feature(enable = "avx512fp16,avx512vl")]
11098#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11099pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
11100    unsafe {
11101        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11102        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11103        _mm_reduce_mul_ph(_mm_mul_ph(p, q))
11104    }
11105}
11106
11107/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11108/// the product of all elements in a.
11109///
11110/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
11111#[inline]
11112#[target_feature(enable = "avx512fp16")]
11113#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11114pub fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
11115    unsafe {
11116        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11117        let q = simd_shuffle!(
11118            a,
11119            a,
11120            [
11121                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11122            ]
11123        );
11124        _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
11125    }
11126}
11127
11128/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11129/// minimum of all elements in a.
11130///
11131/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
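///
/// A minimal usage sketch (illustrative; the helper name is not from this crate and
/// `avx512fp16`/`avx512vl` support is assumed):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn smallest_lane() -> f16 {
///     let v = _mm_set_ph(4.0, -1.5, 9.0, 0.25, 7.0, 3.0, 2.0, 8.0);
///     _mm_reduce_min_ph(v) // expected: -1.5
/// }
/// ```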
11132#[inline]
11133#[target_feature(enable = "avx512fp16,avx512vl")]
11134#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11135pub fn _mm_reduce_min_ph(a: __m128h) -> f16 {
11136    unsafe {
11137        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11138        let a = _mm_min_ph(a, b);
11139        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11140        let a = _mm_min_ph(a, b);
11141        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11142        simd_extract!(_mm_min_sh(a, b), 0)
11143    }
11144}
11145
11146/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11147/// minimum of all elements in a.
11148///
11149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
11150#[inline]
11151#[target_feature(enable = "avx512fp16,avx512vl")]
11152#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11153pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
11154    unsafe {
11155        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11156        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11157        _mm_reduce_min_ph(_mm_min_ph(p, q))
11158    }
11159}
11160
11161/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11162/// minimum of all elements in a.
11163///
11164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
11165#[inline]
11166#[target_feature(enable = "avx512fp16")]
11167#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11168pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
11169    unsafe {
11170        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11171        let q = simd_shuffle!(
11172            a,
11173            a,
11174            [
11175                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11176            ]
11177        );
11178        _mm256_reduce_min_ph(_mm256_min_ph(p, q))
11179    }
11180}
11181
11182/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11183/// maximum of all elements in a.
11184///
11185/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
11186#[inline]
11187#[target_feature(enable = "avx512fp16,avx512vl")]
11188#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11189pub fn _mm_reduce_max_ph(a: __m128h) -> f16 {
11190    unsafe {
11191        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11192        let a = _mm_max_ph(a, b);
11193        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11194        let a = _mm_max_ph(a, b);
11195        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11196        simd_extract!(_mm_max_sh(a, b), 0)
11197    }
11198}
11199
11200/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11201/// maximum of all elements in a.
11202///
11203/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
11204#[inline]
11205#[target_feature(enable = "avx512fp16,avx512vl")]
11206#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11207pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
11208    unsafe {
11209        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11210        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11211        _mm_reduce_max_ph(_mm_max_ph(p, q))
11212    }
11213}
11214
11215/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11216/// maximum of all elements in a.
11217///
11218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
11219#[inline]
11220#[target_feature(enable = "avx512fp16")]
11221#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11222pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
11223    unsafe {
11224        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11225        let q = simd_shuffle!(
11226            a,
11227            a,
11228            [
11229                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11230            ]
11231        );
11232        _mm256_reduce_max_ph(_mm256_max_ph(p, q))
11233    }
11234}
11235
11236macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
11237    ($mask_type: ty, $reg: ident, $a: expr) => {{
11238        let dst: $mask_type;
11239        asm!(
11240            "vfpclassph {k}, {src}, {imm8}",
11241            k = lateout(kreg) dst,
11242            src = in($reg) $a,
11243            imm8 = const IMM8,
11244            options(pure, nomem, nostack)
11245        );
11246        dst
11247    }};
11248    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
11249        let dst: $mask_type;
11250        asm!(
11251            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
11252            k = lateout(kreg) dst,
11253            mask = in(kreg) $mask,
11254            src = in($reg) $a,
11255            imm8 = const IMM8,
11256            options(pure, nomem, nostack)
11257        );
11258        dst
11259    }};
11260}
11261
11262/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11263/// by imm8, and store the results in mask vector k.
11264/// imm can be a combination of:
11265///
11266///     0x01 // QNaN
11267///     0x02 // Positive Zero
11268///     0x04 // Negative Zero
11269///     0x08 // Positive Infinity
11270///     0x10 // Negative Infinity
11271///     0x20 // Denormal
11272///     0x40 // Negative
11273///     0x80 // SNaN
11274///
11275/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
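///
/// A minimal usage sketch (illustrative; the helper name is not from this crate and
/// `avx512fp16`/`avx512vl` support is assumed): combining the QNaN (0x01) and SNaN (0x80)
/// category bits flags every NaN lane.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn nan_lanes(a: __m128h) -> __mmask8 {
///     _mm_fpclass_ph_mask::<0x81>(a)
/// }
/// ```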
11276#[inline]
11277#[target_feature(enable = "avx512fp16,avx512vl")]
11278#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11279#[rustc_legacy_const_generics(1)]
11280#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11281pub fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11282    unsafe {
11283        static_assert_uimm_bits!(IMM8, 8);
11284        fpclass_asm!(__mmask8, xmm_reg, a)
11285    }
11286}
11287
11288/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11289/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11290/// corresponding mask bit is not set).
11291/// imm can be a combination of:
11292///
11293///     0x01 // QNaN
11294///     0x02 // Positive Zero
11295///     0x04 // Negative Zero
11296///     0x08 // Positive Infinity
11297///     0x10 // Negative Infinity
11298///     0x20 // Denormal
11299///     0x40 // Negative
11300///     0x80 // SNaN
11301///
11302/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
11303#[inline]
11304#[target_feature(enable = "avx512fp16,avx512vl")]
11305#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11306#[rustc_legacy_const_generics(2)]
11307#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11308pub fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11309    unsafe {
11310        static_assert_uimm_bits!(IMM8, 8);
11311        fpclass_asm!(__mmask8, k1, xmm_reg, a)
11312    }
11313}
11314
11315/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11316/// by imm8, and store the results in mask vector k.
11317/// imm can be a combination of:
11318///
11319///     0x01 // QNaN
11320///     0x02 // Positive Zero
11321///     0x04 // Negative Zero
11322///     0x08 // Positive Infinity
11323///     0x10 // Negative Infinity
11324///     0x20 // Denormal
11325///     0x40 // Negative
11326///     0x80 // SNaN
11327///
11328/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
11329#[inline]
11330#[target_feature(enable = "avx512fp16,avx512vl")]
11331#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11332#[rustc_legacy_const_generics(1)]
11333#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11334pub fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
11335    unsafe {
11336        static_assert_uimm_bits!(IMM8, 8);
11337        fpclass_asm!(__mmask16, ymm_reg, a)
11338    }
11339}
11340
11341/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11342/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11343/// corresponding mask bit is not set).
11344/// imm can be a combination of:
11345///
11346///     0x01 // QNaN
11347///     0x02 // Positive Zero
11348///     0x04 // Negative Zero
11349///     0x08 // Positive Infinity
11350///     0x10 // Negative Infinity
11351///     0x20 // Denormal
11352///     0x40 // Negative
11353///     0x80 // SNaN
11354///
11355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
11356#[inline]
11357#[target_feature(enable = "avx512fp16,avx512vl")]
11358#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11359#[rustc_legacy_const_generics(2)]
11360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11361pub fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
11362    unsafe {
11363        static_assert_uimm_bits!(IMM8, 8);
11364        fpclass_asm!(__mmask16, k1, ymm_reg, a)
11365    }
11366}
11367
11368/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11369/// by imm8, and store the results in mask vector k.
11370/// imm can be a combination of:
11371///
11372///     0x01 // QNaN
11373///     0x02 // Positive Zero
11374///     0x04 // Negative Zero
11375///     0x08 // Positive Infinity
11376///     0x10 // Negative Infinity
11377///     0x20 // Denormal
11378///     0x40 // Negative
11379///     0x80 // SNaN
11380///
11381/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
11382#[inline]
11383#[target_feature(enable = "avx512fp16")]
11384#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11385#[rustc_legacy_const_generics(1)]
11386#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11387pub fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
11388    unsafe {
11389        static_assert_uimm_bits!(IMM8, 8);
11390        fpclass_asm!(__mmask32, zmm_reg, a)
11391    }
11392}
11393
11394/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11395/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11396/// corresponding mask bit is not set).
11397/// imm can be a combination of:
11398///
11399///     0x01 // QNaN
11400///     0x02 // Positive Zero
11401///     0x04 // Negative Zero
11402///     0x08 // Positive Infinity
11403///     0x10 // Negative Infinity
11404///     0x20 // Denormal
11405///     0x40 // Negative
11406///     0x80 // SNaN
11407///
11408/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
11409#[inline]
11410#[target_feature(enable = "avx512fp16")]
11411#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11412#[rustc_legacy_const_generics(2)]
11413#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11414pub fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
11415    unsafe {
11416        static_assert_uimm_bits!(IMM8, 8);
11417        fpclass_asm!(__mmask32, k1, zmm_reg, a)
11418    }
11419}
11420
11421/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11422/// by imm8, and store the result in mask vector k.
11423/// imm can be a combination of:
11424///
11425///     0x01 // QNaN
11426///     0x02 // Positive Zero
11427///     0x04 // Negative Zero
11428///     0x08 // Positive Infinity
11429///     0x10 // Negative Infinity
11430///     0x20 // Denormal
11431///     0x40 // Negative
11432///     0x80 // SNaN
11433///
11434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
11435#[inline]
11436#[target_feature(enable = "avx512fp16")]
11437#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11438#[rustc_legacy_const_generics(1)]
11439#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11440pub fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11441    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
11442}
11443
11444/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11445/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the
11446/// corresponding mask bit is not set).
11447/// imm can be a combination of:
11448///
11449///     0x01 // QNaN
11450///     0x02 // Positive Zero
11451///     0x04 // Negative Zero
11452///     0x08 // Positive Infinity
11453///     0x10 // Negative Infinity
11454///     0x20 // Denormal
11455///     0x40 // Negative
11456///     0x80 // SNaN
11457///
11458/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
11459#[inline]
11460#[target_feature(enable = "avx512fp16")]
11461#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11462#[rustc_legacy_const_generics(2)]
11463#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11464pub fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11465    unsafe {
11466        static_assert_uimm_bits!(IMM8, 8);
11467        vfpclasssh(a, IMM8, k1)
11468    }
11469}
11470
11471/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11472/// and store the results in dst.
11473///
11474/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
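///
/// A minimal usage sketch (illustrative; the helper name is not from this crate and
/// `avx512fp16`/`avx512vl` support is assumed): when bit `i` of `k` is set, lane `i` is taken
/// from `b`, otherwise from `a`.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn blend_low_half(a: __m128h, b: __m128h) -> __m128h {
///     // Lanes 0..=3 are taken from b, lanes 4..=7 from a.
///     _mm_mask_blend_ph(0b0000_1111, a, b)
/// }
/// ```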
11475#[inline]
11476#[target_feature(enable = "avx512fp16,avx512vl")]
11477#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11478pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
11479    unsafe { simd_select_bitmask(k, b, a) }
11480}
11481
11482/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11483/// and store the results in dst.
11484///
11485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
11486#[inline]
11487#[target_feature(enable = "avx512fp16,avx512vl")]
11488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11489pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
11490    unsafe { simd_select_bitmask(k, b, a) }
11491}
11492
11493/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11494/// and store the results in dst.
11495///
11496/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
11497#[inline]
11498#[target_feature(enable = "avx512fp16")]
11499#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11500pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
11501    unsafe { simd_select_bitmask(k, b, a) }
11502}
11503
11504/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11505/// and index in idx, and store the results in dst.
11506///
11507/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
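///
/// A minimal usage sketch (illustrative; the helper name is not from this crate and
/// `avx512fp16`/`avx512vl` support is assumed): for the 128-bit form, index values 0..=7 pick
/// lanes from `a` and 8..=15 pick lanes from `b`.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn interleave_low(a: __m128h, b: __m128h) -> __m128h {
///     // Indices [0, 8, 1, 9, 2, 10, 3, 11] interleave the low halves of a and b.
///     let idx = _mm_set_epi16(11, 3, 10, 2, 9, 1, 8, 0);
///     _mm_permutex2var_ph(a, idx, b)
/// }
/// ```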
11508#[inline]
11509#[target_feature(enable = "avx512fp16,avx512vl")]
11510#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11511pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
11512    _mm_castsi128_ph(_mm_permutex2var_epi16(
11513        _mm_castph_si128(a),
11514        idx,
11515        _mm_castph_si128(b),
11516    ))
11517}
11518
11519/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11520/// and index in idx, and store the results in dst.
11521///
11522/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
11523#[inline]
11524#[target_feature(enable = "avx512fp16,avx512vl")]
11525#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11526pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
11527    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
11528        _mm256_castph_si256(a),
11529        idx,
11530        _mm256_castph_si256(b),
11531    ))
11532}
11533
11534/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11535/// and index in idx, and store the results in dst.
11536///
11537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
11538#[inline]
11539#[target_feature(enable = "avx512fp16")]
11540#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11541pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
11542    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
11543        _mm512_castph_si512(a),
11544        idx,
11545        _mm512_castph_si512(b),
11546    ))
11547}
11548
11549/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11550/// and store the results in dst.
11551///
11552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
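///
/// A minimal usage sketch (illustrative; the helper name is not from this crate and
/// `avx512fp16`/`avx512vl` support is assumed): lane `i` of the result is `a[idx[i]]`, so a
/// descending index vector reverses the lanes.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn reverse_lanes(a: __m128h) -> __m128h {
///     let idx = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
///     _mm_permutexvar_ph(idx, a)
/// }
/// ```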
11553#[inline]
11554#[target_feature(enable = "avx512fp16,avx512vl")]
11555#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11556pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
11557    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
11558}
11559
11560/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11561/// and store the results in dst.
11562///
11563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
11564#[inline]
11565#[target_feature(enable = "avx512fp16,avx512vl")]
11566#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11567pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
11568    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
11569}
11570
11571/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11572/// and store the results in dst.
11573///
11574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
11575#[inline]
11576#[target_feature(enable = "avx512fp16")]
11577#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11578pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
11579    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
11580}
11581
11582/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11583/// and store the results in dst.
11584///
11585/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
11586#[inline]
11587#[target_feature(enable = "avx512fp16,avx512vl")]
11588#[cfg_attr(test, assert_instr(vcvtw2ph))]
11589#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11590pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
11591    unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) }
11592}
11593
11594/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11595/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11596/// mask bit is not set).
11597///
11598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
11599#[inline]
11600#[target_feature(enable = "avx512fp16,avx512vl")]
11601#[cfg_attr(test, assert_instr(vcvtw2ph))]
11602#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11603pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11604    unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) }
11605}
11606
11607/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11608/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11609///
11610/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
11611#[inline]
11612#[target_feature(enable = "avx512fp16,avx512vl")]
11613#[cfg_attr(test, assert_instr(vcvtw2ph))]
11614#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11615pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
11616    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
11617}
11618
11619/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11620/// and store the results in dst.
11621///
11622/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
11623#[inline]
11624#[target_feature(enable = "avx512fp16,avx512vl")]
11625#[cfg_attr(test, assert_instr(vcvtw2ph))]
11626#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11627pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
11628    unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) }
11629}
11630
11631/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11632/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11633/// mask bit is not set).
11634///
11635/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
11636#[inline]
11637#[target_feature(enable = "avx512fp16,avx512vl")]
11638#[cfg_attr(test, assert_instr(vcvtw2ph))]
11639#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11640pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11641    unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) }
11642}
11643
11644/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11645/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11646///
11647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
11648#[inline]
11649#[target_feature(enable = "avx512fp16,avx512vl")]
11650#[cfg_attr(test, assert_instr(vcvtw2ph))]
11651#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11652pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
11653    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
11654}
11655
11656/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11657/// and store the results in dst.
11658///
11659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
11660#[inline]
11661#[target_feature(enable = "avx512fp16")]
11662#[cfg_attr(test, assert_instr(vcvtw2ph))]
11663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11664pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
11665    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
11666}
11667
11668/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11669/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11670/// mask bit is not set).
11671///
11672/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
11673#[inline]
11674#[target_feature(enable = "avx512fp16")]
11675#[cfg_attr(test, assert_instr(vcvtw2ph))]
11676#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11677pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11678    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
11679}
11680
11681/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11682/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11683///
11684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
11685#[inline]
11686#[target_feature(enable = "avx512fp16")]
11687#[cfg_attr(test, assert_instr(vcvtw2ph))]
11688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11689pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
11690    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
11691}
11692
11693/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11694/// and store the results in dst.
11695///
11696/// Rounding is done according to the rounding parameter, which can be one of:
11697///
11698/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11699/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11700/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11701/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11702/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11703///
11704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
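///
/// A minimal usage sketch (illustrative; the helper name is not from this crate and
/// `avx512fp16` support is assumed): 16-bit integers with magnitude above 2048 are not all
/// exactly representable as f16, so the chosen rounding mode can affect the result.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn convert_truncating(a: __m512i) -> __m512h {
///     _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a)
/// }
/// ```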
11705#[inline]
11706#[target_feature(enable = "avx512fp16")]
11707#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11708#[rustc_legacy_const_generics(1)]
11709#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11710pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11711    unsafe {
11712        static_assert_rounding!(ROUNDING);
11713        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
11714    }
11715}
11716
11717/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11718/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11719/// mask bit is not set).
11720///
11721/// Rounding is done according to the rounding parameter, which can be one of:
11722///
11723/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11724/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11725/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11726/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11727/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11728///
11729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
11730#[inline]
11731#[target_feature(enable = "avx512fp16")]
11732#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11733#[rustc_legacy_const_generics(3)]
11734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11735pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
11736    src: __m512h,
11737    k: __mmask32,
11738    a: __m512i,
11739) -> __m512h {
11740    unsafe {
11741        static_assert_rounding!(ROUNDING);
11742        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
11743    }
11744}
11745
11746/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11747/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11748///
11749/// Rounding is done according to the rounding parameter, which can be one of:
11750///
11751/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11752/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11753/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11754/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11755/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11756///
11757/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
11758#[inline]
11759#[target_feature(enable = "avx512fp16")]
11760#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11761#[rustc_legacy_const_generics(2)]
11762#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11763pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11764    static_assert_rounding!(ROUNDING);
11765    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11766}
11767
11768/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11769/// and store the results in dst.
11770///
11771/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
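///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input value is
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` and `avx512vl` enabled):
///
/// ```ignore
/// // Eight unsigned 16-bit lanes, all equal to 1000.
/// let a = _mm_set1_epi16(1000);
/// let r = _mm_cvtepu16_ph(a);
/// // Every f16 lane of `r` is 1000.0 (exactly representable in half precision).
/// ```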
11772#[inline]
11773#[target_feature(enable = "avx512fp16,avx512vl")]
11774#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11775#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11776pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
11777    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
11778}
11779
11780/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11781/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11782/// mask bit is not set).
11783///
11784/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
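///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the mask and input
/// values are arbitrary and the caller is assumed to be compiled with `avx512fp16` and
/// `avx512vl` enabled):
///
/// ```ignore
/// let src = _mm_set1_ph(-1.0);
/// let a = _mm_set1_epi16(42);
/// // Bits 0, 2, 4 and 6 of the mask are set, so only the even lanes are converted;
/// // the odd lanes keep the corresponding elements of `src`.
/// let r = _mm_mask_cvtepu16_ph(src, 0b0101_0101, a);
/// // `r` is [42.0, -1.0, 42.0, -1.0, 42.0, -1.0, 42.0, -1.0].
/// ```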
11785#[inline]
11786#[target_feature(enable = "avx512fp16,avx512vl")]
11787#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11788#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11789pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11790    unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) }
11791}
11792
11793/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11794/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11795///
11796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
11797#[inline]
11798#[target_feature(enable = "avx512fp16,avx512vl")]
11799#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11800#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11801pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
11802    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
11803}
11804
11805/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11806/// and store the results in dst.
11807///
11808/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
11809#[inline]
11810#[target_feature(enable = "avx512fp16,avx512vl")]
11811#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11812#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11813pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
11814    unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) }
11815}
11816
11817/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11818/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11819/// mask bit is not set).
11820///
11821/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
11822#[inline]
11823#[target_feature(enable = "avx512fp16,avx512vl")]
11824#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11825#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11826pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11827    unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) }
11828}
11829
11830/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11831/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11832///
11833/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
11834#[inline]
11835#[target_feature(enable = "avx512fp16,avx512vl")]
11836#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11837#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11838pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
11839    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
11840}
11841
11842/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11843/// and store the results in dst.
11844///
11845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
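///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input value is
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` enabled):
///
/// ```ignore
/// // 32 lanes holding 60000, a value outside the i16 range, so the unsigned variant matters.
/// let a = _mm512_set1_epi16(60000u16 as i16);
/// let r = _mm512_cvtepu16_ph(a);
/// // Every f16 lane of `r` is 60000.0 (exactly representable: 60000 = 1875 * 32).
/// ```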
11846#[inline]
11847#[target_feature(enable = "avx512fp16")]
11848#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11849#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11850pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
11851    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
11852}
11853
11854/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11855/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11856/// mask bit is not set).
11857///
11858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
11859#[inline]
11860#[target_feature(enable = "avx512fp16")]
11861#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11862#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11863pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11864    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
11865}
11866
11867/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11868/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11869///
11870/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
11871#[inline]
11872#[target_feature(enable = "avx512fp16")]
11873#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11874#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11875pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
11876    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
11877}
11878
11879/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11880/// and store the results in dst.
11881///
11882/// Rounding is done according to the rounding parameter, which can be one of:
11883///
11884/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11885/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11886/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11887/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11888/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11889///
11890/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
11891#[inline]
11892#[target_feature(enable = "avx512fp16")]
11893#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11894#[rustc_legacy_const_generics(1)]
11895#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11896pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11897    unsafe {
11898        static_assert_rounding!(ROUNDING);
11899        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
11900    }
11901}
11902
11903/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11904/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11905/// mask bit is not set).
11906///
11907/// Rounding is done according to the rounding parameter, which can be one of:
11908///
11909/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11910/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11911/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11912/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11913/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11914///
11915/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
11916#[inline]
11917#[target_feature(enable = "avx512fp16")]
11918#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11919#[rustc_legacy_const_generics(3)]
11920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11921pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
11922    src: __m512h,
11923    k: __mmask32,
11924    a: __m512i,
11925) -> __m512h {
11926    unsafe {
11927        static_assert_rounding!(ROUNDING);
11928        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
11929    }
11930}
11931
11932/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11933/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11934///
11935/// Rounding is done according to the rounding parameter, which can be one of:
11936///
11937/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11938/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11939/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11940/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11941/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11942///
11943/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
11944#[inline]
11945#[target_feature(enable = "avx512fp16")]
11946#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11947#[rustc_legacy_const_generics(2)]
11948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11949pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11950    static_assert_rounding!(ROUNDING);
11951    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11952}
11953
11954/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11955/// and store the results in dst. The upper 64 bits of dst are zeroed out.
11956///
11957/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
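///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input values are
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` and `avx512vl` enabled):
///
/// ```ignore
/// let a = _mm_set_epi32(4, 3, 2, 1); // elements 0..=3 are 1, 2, 3, 4
/// let r = _mm_cvtepi32_ph(a);
/// // The lower four f16 lanes of `r` are 1.0, 2.0, 3.0, 4.0; the upper four lanes are zero.
/// ```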
11958#[inline]
11959#[target_feature(enable = "avx512fp16,avx512vl")]
11960#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11961#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11962pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
11963    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
11964}
11965
11966/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11967/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11968/// mask bit is not set). The upper 64 bits of dst are zeroed out.
11969///
11970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
11971#[inline]
11972#[target_feature(enable = "avx512fp16,avx512vl")]
11973#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11974#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11975pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11976    unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) }
11977}
11978
11979/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11980/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11981/// The upper 64 bits of dst are zeroed out.
11982///
11983/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
11984#[inline]
11985#[target_feature(enable = "avx512fp16,avx512vl")]
11986#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11987#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11988pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
11989    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
11990}
11991
11992/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11993/// and store the results in dst.
11994///
11995/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
11996#[inline]
11997#[target_feature(enable = "avx512fp16,avx512vl")]
11998#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11999#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12000pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
12001    unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) }
12002}
12003
12004/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12005/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12006/// mask bit is not set).
12007///
12008/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
12009#[inline]
12010#[target_feature(enable = "avx512fp16,avx512vl")]
12011#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12012#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12013pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12014    unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) }
12015}
12016
12017/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12018/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12019///
12020/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
12021#[inline]
12022#[target_feature(enable = "avx512fp16,avx512vl")]
12023#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12024#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12025pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
12026    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
12027}
12028
12029/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12030/// and store the results in dst.
12031///
12032/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
12033#[inline]
12034#[target_feature(enable = "avx512fp16")]
12035#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12037pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
12038    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
12039}
12040
12041/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12042/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12043/// mask bit is not set).
12044///
12045/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
12046#[inline]
12047#[target_feature(enable = "avx512fp16")]
12048#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12049#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12050pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12051    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
12052}
12053
12054/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12055/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12056///
12057/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
12058#[inline]
12059#[target_feature(enable = "avx512fp16")]
12060#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12061#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12062pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
12063    _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a)
12064}
12065
12066/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12067/// and store the results in dst.
12068///
12069/// Rounding is done according to the rounding parameter, which can be one of:
12070///
12071/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12072/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12073/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12074/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12075/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12076///
12077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
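///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input value is
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` enabled):
///
/// ```ignore
/// // Sixteen signed 32-bit lanes, all -3.
/// let a = _mm512_set1_epi32(-3);
/// // Round toward negative infinity and suppress floating-point exceptions.
/// let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// // `r` is a __m256h whose sixteen f16 lanes are all -3.0.
/// ```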
12078#[inline]
12079#[target_feature(enable = "avx512fp16")]
12080#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12081#[rustc_legacy_const_generics(1)]
12082#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12083pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12084    unsafe {
12085        static_assert_rounding!(ROUNDING);
12086        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
12087    }
12088}
12089
12090/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12091/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12092/// mask bit is not set).
12093///
12094/// Rounding is done according to the rounding parameter, which can be one of:
12095///
12096/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12097/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12098/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12099/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12100/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12101///
12102/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
12103#[inline]
12104#[target_feature(enable = "avx512fp16")]
12105#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12106#[rustc_legacy_const_generics(3)]
12107#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12108pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
12109    src: __m256h,
12110    k: __mmask16,
12111    a: __m512i,
12112) -> __m256h {
12113    unsafe {
12114        static_assert_rounding!(ROUNDING);
12115        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
12116    }
12117}
12118
12119/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12120/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12121///
12122/// Rounding is done according to the rounding parameter, which can be one of:
12123///
12124/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12125/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12126/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12127/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12128/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12129///
12130/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
12131#[inline]
12132#[target_feature(enable = "avx512fp16")]
12133#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12134#[rustc_legacy_const_generics(2)]
12135#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12136pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12137    static_assert_rounding!(ROUNDING);
12138    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12139}
12140
12141/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12142/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12143/// of dst.
12144///
12145/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
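///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input values are
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` enabled):
///
/// ```ignore
/// let a = _mm_set1_ph(9.0);
/// let r = _mm_cvti32_sh(a, 25);
/// // Lane 0 of `r` is 25.0; lanes 1..=7 are copied from `a` and remain 9.0.
/// ```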
12146#[inline]
12147#[target_feature(enable = "avx512fp16")]
12148#[cfg_attr(test, assert_instr(vcvtsi2sh))]
12149#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12150pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
12151    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12152}
12153
12154/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12155/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12156/// of dst.
12157///
12158/// Rounding is done according to the rounding parameter, which can be one of:
12159///
12160/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12161/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12162/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12163/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12164/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12165///
12166/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
12167#[inline]
12168#[target_feature(enable = "avx512fp16")]
12169#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
12170#[rustc_legacy_const_generics(2)]
12171#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12172pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
12173    unsafe {
12174        static_assert_rounding!(ROUNDING);
12175        vcvtsi2sh(a, b, ROUNDING)
12176    }
12177}
12178
12179/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12180/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12181///
12182/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
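///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input values are
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` and `avx512vl` enabled):
///
/// ```ignore
/// let a = _mm_set_epi32(40, 30, 20, 10); // elements 0..=3 are 10, 20, 30, 40
/// let r = _mm_cvtepu32_ph(a);
/// // The lower four f16 lanes of `r` are 10.0, 20.0, 30.0, 40.0; the upper four lanes are zero.
/// ```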
12183#[inline]
12184#[target_feature(enable = "avx512fp16,avx512vl")]
12185#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12186#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12187pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
12188    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
12189}
12190
12191/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12192/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12193/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12194///
12195/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
12196#[inline]
12197#[target_feature(enable = "avx512fp16,avx512vl")]
12198#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12199#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12200pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12201    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
12202}
12203
12204/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12205/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12206/// The upper 64 bits of dst are zeroed out.
12207///
12208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
12209#[inline]
12210#[target_feature(enable = "avx512fp16,avx512vl")]
12211#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12212#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12213pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
12214    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12215}
12216
12217/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12218/// and store the results in dst.
12219///
12220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
12221#[inline]
12222#[target_feature(enable = "avx512fp16,avx512vl")]
12223#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12224#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12225pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
12226    unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) }
12227}
12228
12229/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12230/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12231/// mask bit is not set).
12232///
12233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
12234#[inline]
12235#[target_feature(enable = "avx512fp16,avx512vl")]
12236#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12237#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12238pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12239    unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) }
12240}
12241
12242/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12243/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12244///
12245/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
12246#[inline]
12247#[target_feature(enable = "avx512fp16,avx512vl")]
12248#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12249#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12250pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
12251    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12252}
12253
12254/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12255/// and store the results in dst.
12256///
12257/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
12258#[inline]
12259#[target_feature(enable = "avx512fp16")]
12260#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12262pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
12263    unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) }
12264}
12265
12266/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12267/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12268/// mask bit is not set).
12269///
12270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
12271#[inline]
12272#[target_feature(enable = "avx512fp16")]
12273#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12275pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12276    unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) }
12277}
12278
12279/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12280/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12281///
12282/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
12283#[inline]
12284#[target_feature(enable = "avx512fp16")]
12285#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12287pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
12288    _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a)
12289}
12290
12291/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12292/// and store the results in dst.
12293///
12294/// Rounding is done according to the rounding parameter, which can be one of:
12295///
12296/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12297/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12298/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12299/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12300/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12301///
12302/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
12303#[inline]
12304#[target_feature(enable = "avx512fp16")]
12305#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12306#[rustc_legacy_const_generics(1)]
12307#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12308pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12309    unsafe {
12310        static_assert_rounding!(ROUNDING);
12311        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
12312    }
12313}
12314
12315/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12316/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12317/// mask bit is not set).
12318///
12319/// Rounding is done according to the rounding parameter, which can be one of:
12320///
12321/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12322/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12323/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12324/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12325/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12326///
12327/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
12328#[inline]
12329#[target_feature(enable = "avx512fp16")]
12330#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12331#[rustc_legacy_const_generics(3)]
12332#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12333pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
12334    src: __m256h,
12335    k: __mmask16,
12336    a: __m512i,
12337) -> __m256h {
12338    unsafe {
12339        static_assert_rounding!(ROUNDING);
12340        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
12341    }
12342}
12343
12344/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12345/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12346///
12347/// Rounding is done according to the rounding parameter, which can be one of:
12348///
12349/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12350/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12351/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12352/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12353/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12354///
12355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
12356#[inline]
12357#[target_feature(enable = "avx512fp16")]
12358#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12359#[rustc_legacy_const_generics(2)]
12360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12361pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12362    static_assert_rounding!(ROUNDING);
12363    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12364}
12365
12366/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12367/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12368/// of dst.
12369///
12370/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
12371#[inline]
12372#[target_feature(enable = "avx512fp16")]
12373#[cfg_attr(test, assert_instr(vcvtusi2sh))]
12374#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12375pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
12376    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12377}
12378
12379/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12380/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12381/// of dst.
12382///
12383/// Rounding is done according to the rounding parameter, which can be one of:
12384///
12385/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12386/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12387/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12388/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12389/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12390///
12391/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
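///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input values are
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` enabled):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// // 2049 is not exactly representable in half precision (the spacing between representable
/// // values is 2 in this range), so the rounding mode is visible: rounding up yields 2050.0,
/// // while truncation would yield 2048.0.
/// let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, 2049);
/// // Lane 0 of `r` is 2050.0; lanes 1..=7 are copied from `a` and remain 2.0.
/// ```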
12392#[inline]
12393#[target_feature(enable = "avx512fp16")]
12394#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
12395#[rustc_legacy_const_generics(2)]
12396#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12397pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
12398    unsafe {
12399        static_assert_rounding!(ROUNDING);
12400        vcvtusi2sh(a, b, ROUNDING)
12401    }
12402}
12403
12404/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12405/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12406///
12407/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
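///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input values are
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` and `avx512vl` enabled):
///
/// ```ignore
/// let a = _mm_set_epi64x(-2, 5); // element 0 is 5, element 1 is -2
/// let r = _mm_cvtepi64_ph(a);
/// // f16 lanes 0 and 1 of `r` are 5.0 and -2.0; lanes 2..=7 are zero.
/// ```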
12408#[inline]
12409#[target_feature(enable = "avx512fp16,avx512vl")]
12410#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12411#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12412pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
12413    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12414}
12415
12416/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12417/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12418/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12419///
12420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
12421#[inline]
12422#[target_feature(enable = "avx512fp16,avx512vl")]
12423#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12425pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12426    unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) }
12427}
12428
12429/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12430/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12431/// The upper 96 bits of dst are zeroed out.
12432///
12433/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
12434#[inline]
12435#[target_feature(enable = "avx512fp16,avx512vl")]
12436#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12437#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12438pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
12439    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12440}
12441
12442/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12443/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12444///
12445/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
12446#[inline]
12447#[target_feature(enable = "avx512fp16,avx512vl")]
12448#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12449#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12450pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
12451    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12452}
12453
12454/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12455/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12456/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12457///
12458/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
12459#[inline]
12460#[target_feature(enable = "avx512fp16,avx512vl")]
12461#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12462#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12463pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12464    unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) }
12465}
12466
12467/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12468/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12469/// The upper 64 bits of dst are zeroed out.
12470///
12471/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
12472#[inline]
12473#[target_feature(enable = "avx512fp16,avx512vl")]
12474#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12475#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12476pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
12477    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12478}
12479
12480/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12481/// and store the results in dst.
12482///
12483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
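///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input value is
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` enabled):
///
/// ```ignore
/// // Eight signed 64-bit lanes, all -1.
/// let a = _mm512_set1_epi64(-1);
/// let r = _mm512_cvtepi64_ph(a);
/// // All eight f16 lanes of the __m128h result are -1.0.
/// ```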
12484#[inline]
12485#[target_feature(enable = "avx512fp16")]
12486#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12487#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12488pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
12489    unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) }
12490}
12491
12492/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12493/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12494/// mask bit is not set).
12495///
12496/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
12497#[inline]
12498#[target_feature(enable = "avx512fp16")]
12499#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12500#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12501pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12502    unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) }
12503}
12504
12505/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12506/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12507///
12508/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
12509#[inline]
12510#[target_feature(enable = "avx512fp16")]
12511#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12512#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12513pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
12514    _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a)
12515}
12516
12517/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12518/// and store the results in dst.
12519///
12520/// Rounding is done according to the rounding parameter, which can be one of:
12521///
12522/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12523/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12524/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12525/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12526/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12527///
12528/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
12529#[inline]
12530#[target_feature(enable = "avx512fp16")]
12531#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12532#[rustc_legacy_const_generics(1)]
12533#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12534pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12535    unsafe {
12536        static_assert_rounding!(ROUNDING);
12537        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
12538    }
12539}
12540
12541/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12542/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12543/// mask bit is not set).
12544///
12545/// Rounding is done according to the rounding parameter, which can be one of:
12546///
12547/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12548/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12549/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12550/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12551/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12552///
12553/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
12554#[inline]
12555#[target_feature(enable = "avx512fp16")]
12556#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12557#[rustc_legacy_const_generics(3)]
12558#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12559pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
12560    src: __m128h,
12561    k: __mmask8,
12562    a: __m512i,
12563) -> __m128h {
12564    unsafe {
12565        static_assert_rounding!(ROUNDING);
12566        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
12567    }
12568}
12569
12570/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12571/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12572///
12573/// Rounding is done according to the rounding parameter, which can be one of:
12574///
12575/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12576/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12577/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12578/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12579/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12580///
12581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
12582#[inline]
12583#[target_feature(enable = "avx512fp16")]
12584#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12585#[rustc_legacy_const_generics(2)]
12586#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12587pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12588    static_assert_rounding!(ROUNDING);
12589    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
12590}
12591
12592/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12593/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12594///
12595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
12596#[inline]
12597#[target_feature(enable = "avx512fp16,avx512vl")]
12598#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12599#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12600pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
12601    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12602}
12603
12604/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12605/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12606/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12607///
12608/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
12609#[inline]
12610#[target_feature(enable = "avx512fp16,avx512vl")]
12611#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12612#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12613pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12614    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
12615}
12616
12617/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12618/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12619/// The upper 96 bits of dst are zeroed out.
12620///
12621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
12622#[inline]
12623#[target_feature(enable = "avx512fp16,avx512vl")]
12624#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12626pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
12627    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12628}
12629
12630/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12631/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12632///
12633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
12634#[inline]
12635#[target_feature(enable = "avx512fp16,avx512vl")]
12636#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12637#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12638pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
12639    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12640}
12641
12642/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12643/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12644/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12645///
12646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
12647#[inline]
12648#[target_feature(enable = "avx512fp16,avx512vl")]
12649#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12651pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12652    unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) }
12653}
12654
12655/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12656/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12657/// The upper 64 bits of dst are zeroed out.
12658///
12659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
12660#[inline]
12661#[target_feature(enable = "avx512fp16,avx512vl")]
12662#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12664pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
12665    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12666}
12667
12668/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12669/// and store the results in dst.
12670///
12671/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
12672#[inline]
12673#[target_feature(enable = "avx512fp16")]
12674#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12675#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12676pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
12677    unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) }
12678}
12679
12680/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12681/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12682/// mask bit is not set).
12683///
12684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
12685#[inline]
12686#[target_feature(enable = "avx512fp16")]
12687#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12689pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12690    unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) }
12691}
12692
12693/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12694/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12695///
12696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
12697#[inline]
12698#[target_feature(enable = "avx512fp16")]
12699#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12700#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12701pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
12702    _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a)
12703}
12704
12705/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12706/// and store the results in dst.
12707///
12708/// Rounding is done according to the rounding parameter, which can be one of:
12709///
12710/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12711/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12712/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12713/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12714/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12715///
12716/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
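///
/// A minimal usage sketch (illustrative, not from Intel's documentation; the input value is
/// arbitrary and the caller is assumed to be compiled with `avx512fp16` enabled):
///
/// ```ignore
/// // Eight unsigned 64-bit lanes, all 1023.
/// let a = _mm512_set1_epi64(1023);
/// let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
/// // All eight f16 lanes of `r` are 1023.0 (exactly representable in half precision).
/// ```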
12717#[inline]
12718#[target_feature(enable = "avx512fp16")]
12719#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12720#[rustc_legacy_const_generics(1)]
12721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12722pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12723    unsafe {
12724        static_assert_rounding!(ROUNDING);
12725        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
12726    }
12727}
12728
12729/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12730/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12731/// mask bit is not set).
12732///
12733/// Rounding is done according to the rounding parameter, which can be one of:
12734///
12735/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12736/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12737/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12738/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12739/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12740///
12741/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
12742#[inline]
12743#[target_feature(enable = "avx512fp16")]
12744#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12745#[rustc_legacy_const_generics(3)]
12746#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12747pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
12748    src: __m128h,
12749    k: __mmask8,
12750    a: __m512i,
12751) -> __m128h {
12752    unsafe {
12753        static_assert_rounding!(ROUNDING);
12754        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
12755    }
12756}
12757
12758/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12759/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12760///
12761/// Rounding is done according to the rounding parameter, which can be one of:
12762///
12763/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12764/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12765/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12766/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12767/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12768///
12769/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
12770#[inline]
12771#[target_feature(enable = "avx512fp16")]
12772#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12773#[rustc_legacy_const_generics(2)]
12774#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12775pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12776    static_assert_rounding!(ROUNDING);
12777    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
12778}
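// Illustrative sketch (not part of the exported API): selecting an explicit
// rounding mode for the unsigned 64-bit -> f16 conversion above. The locals
// below are hypothetical and assume `avx512fp16` is enabled on the caller.
//
//     let a = _mm512_set1_epi64(3);
//     let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);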
12779
12780/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12781/// floating-point elements, and store the results in dst.
12782///
12783/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
12784#[inline]
12785#[target_feature(enable = "avx512fp16,avx512vl")]
12786#[cfg_attr(test, assert_instr(vcvtps2phx))]
12787#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12788pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
12789    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12790}
12791
12792/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12793/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12794/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12795///
12796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
12797#[inline]
12798#[target_feature(enable = "avx512fp16,avx512vl")]
12799#[cfg_attr(test, assert_instr(vcvtps2phx))]
12800#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12801pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
12802    unsafe { vcvtps2phx_128(a, src, k) }
12803}
12804
12805/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12806/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12807/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12808///
12809/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
12810#[inline]
12811#[target_feature(enable = "avx512fp16,avx512vl")]
12812#[cfg_attr(test, assert_instr(vcvtps2phx))]
12813#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12814pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
12815    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12816}
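// Illustrative sketch: writemask vs. zeromask behaviour of the conversions
// above, assuming `avx512fp16` and `avx512vl` are enabled. Hypothetical locals.
//
//     let src = _mm_set1_ph(9.0);
//     let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
//     let masked = _mm_mask_cvtxps_ph(src, 0b0011, a); // lanes 0-1 converted, lanes 2-3 kept from src
//     let zeroed = _mm_maskz_cvtxps_ph(0b0011, a);     // lanes 2-3 become 0.0 instead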
12817
12818/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12819/// floating-point elements, and store the results in dst.
12820///
12821/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
12822#[inline]
12823#[target_feature(enable = "avx512fp16,avx512vl")]
12824#[cfg_attr(test, assert_instr(vcvtps2phx))]
12825#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12826pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
12827    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12828}
12829
12830/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12831/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12832/// when the corresponding mask bit is not set).
12833///
12834/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
12835#[inline]
12836#[target_feature(enable = "avx512fp16,avx512vl")]
12837#[cfg_attr(test, assert_instr(vcvtps2phx))]
12838#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12839pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
12840    unsafe { vcvtps2phx_256(a, src, k) }
12841}
12842
12843/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12844/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12845/// corresponding mask bit is not set).
12846///
12847/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
12848#[inline]
12849#[target_feature(enable = "avx512fp16,avx512vl")]
12850#[cfg_attr(test, assert_instr(vcvtps2phx))]
12851#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12852pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
12853    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12854}
12855
12856/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12857/// floating-point elements, and store the results in dst.
12858///
12859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
12860#[inline]
12861#[target_feature(enable = "avx512fp16")]
12862#[cfg_attr(test, assert_instr(vcvtps2phx))]
12863#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12864pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
12865    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a)
12866}
12867
12868/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12869/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12870/// when the corresponding mask bit is not set).
12871///
12872/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
12873#[inline]
12874#[target_feature(enable = "avx512fp16")]
12875#[cfg_attr(test, assert_instr(vcvtps2phx))]
12876#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12877pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
12878    unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
12879}
12880
12881/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12882/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12883/// corresponding mask bit is not set).
12884///
12885/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
12886#[inline]
12887#[target_feature(enable = "avx512fp16")]
12888#[cfg_attr(test, assert_instr(vcvtps2phx))]
12889#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12890pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
12891    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a)
12892}
12893
12894/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12895/// floating-point elements, and store the results in dst.
12896///
12897/// Rounding is done according to the rounding parameter, which can be one of:
12898///
12899/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12900/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12901/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12902/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12903/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12904///
12905/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
12906#[inline]
12907#[target_feature(enable = "avx512fp16")]
12908#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12909#[rustc_legacy_const_generics(1)]
12910#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12911pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
12912    static_assert_rounding!(ROUNDING);
12913    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), 0xffff, a)
12914}
12915
12916/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12917/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12918/// when the corresponding mask bit is not set).
12919///
12920/// Rounding is done according to the rounding parameter, which can be one of:
12921///
12922/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12923/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12924/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12925/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12926/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12927///
12928/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
12929#[inline]
12930#[target_feature(enable = "avx512fp16")]
12931#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12932#[rustc_legacy_const_generics(3)]
12933#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12934pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
12935    src: __m256h,
12936    k: __mmask16,
12937    a: __m512,
12938) -> __m256h {
12939    unsafe {
12940        static_assert_rounding!(ROUNDING);
12941        vcvtps2phx_512(a, src, k, ROUNDING)
12942    }
12943}
12944
12945/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12946/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12947/// corresponding mask bit is not set).
12948///
12949/// Rounding is done according to the rounding parameter, which can be one of:
12950///
12951/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12952/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12953/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12954/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12955/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12956///
12957/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
12958#[inline]
12959#[target_feature(enable = "avx512fp16")]
12960#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12961#[rustc_legacy_const_generics(2)]
12962#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12963pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
12964    static_assert_rounding!(ROUNDING);
12965    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12966}
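// Illustrative sketch: picking a non-default rounding mode while narrowing
// sixteen f32 lanes to f16, assuming `avx512fp16` is enabled. Hypothetical locals.
//
//     let a = _mm512_set1_ps(1.5);
//     let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);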
12967
12968/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12969/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
12970/// elements from a to the upper elements of dst.
12971///
12972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
12973#[inline]
12974#[target_feature(enable = "avx512fp16")]
12975#[cfg_attr(test, assert_instr(vcvtss2sh))]
12976#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12977pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
12978    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
12979}
12980
12981/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12982/// floating-point element, store the result in the lower element of dst using writemask k (the element
12983/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
12984/// upper elements of dst.
12985///
12986/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
12987#[inline]
12988#[target_feature(enable = "avx512fp16")]
12989#[cfg_attr(test, assert_instr(vcvtss2sh))]
12990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12991pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
12992    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
12993}
12994
12995/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12996/// floating-point element, store the result in the lower element of dst using zeromask k (the element
12997/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
12998/// elements of dst.
12999///
13000/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
13001#[inline]
13002#[target_feature(enable = "avx512fp16")]
13003#[cfg_attr(test, assert_instr(vcvtss2sh))]
13004#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13005pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
13006    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b)
13007}
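// Illustrative sketch of the scalar ss -> sh lane layout, assuming
// `avx512fp16` is enabled. Hypothetical locals.
//
//     let a = _mm_set1_ph(2.0);   // supplies the upper 7 f16 lanes
//     let b = _mm_set_ss(0.5);    // lower f32 lane to be narrowed
//     let r = _mm_cvtss_sh(a, b); // lane 0 = 0.5, lanes 1..=7 = 2.0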
13008
13009/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13010/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13011/// elements from a to the upper elements of dst.
13012///
13013/// Rounding is done according to the rounding parameter, which can be one of:
13014///
13015/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13016/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13017/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13018/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13019/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13020///
13021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
13022#[inline]
13023#[target_feature(enable = "avx512fp16")]
13024#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13025#[rustc_legacy_const_generics(2)]
13026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13027pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
13028    static_assert_rounding!(ROUNDING);
13029    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
13030}
13031
13032/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13033/// floating-point element, store the result in the lower element of dst using writemask k (the element
13034/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13035/// upper elements of dst.
13036///
13037/// Rounding is done according to the rounding parameter, which can be one of:
13038///
13039/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13040/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13041/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13042/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13043/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13044///
13045/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
13046#[inline]
13047#[target_feature(enable = "avx512fp16")]
13048#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13049#[rustc_legacy_const_generics(4)]
13050#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13051pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
13052    src: __m128h,
13053    k: __mmask8,
13054    a: __m128h,
13055    b: __m128,
13056) -> __m128h {
13057    unsafe {
13058        static_assert_rounding!(ROUNDING);
13059        vcvtss2sh(a, b, src, k, ROUNDING)
13060    }
13061}
13062
13063/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13064/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13065/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13066/// elements of dst.
13067///
13068/// Rounding is done according to the rounding parameter, which can be one of:
13069///
13070/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13071/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13072/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13073/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13074/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13075///
13076/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
13077#[inline]
13078#[target_feature(enable = "avx512fp16")]
13079#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13080#[rustc_legacy_const_generics(3)]
13081#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13082pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
13083    k: __mmask8,
13084    a: __m128h,
13085    b: __m128,
13086) -> __m128h {
13087    static_assert_rounding!(ROUNDING);
13088    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13089}
13090
13091/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13092/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
13093///
13094/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
13095#[inline]
13096#[target_feature(enable = "avx512fp16,avx512vl")]
13097#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13098#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13099pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
13100    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13101}
13102
13103/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13104/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13105/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13106///
13107/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
13108#[inline]
13109#[target_feature(enable = "avx512fp16,avx512vl")]
13110#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13111#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13112pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
13113    unsafe { vcvtpd2ph_128(a, src, k) }
13114}
13115
13116/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13117/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13118/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13119///
13120/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
13121#[inline]
13122#[target_feature(enable = "avx512fp16,avx512vl")]
13123#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13124#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13125pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
13126    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13127}
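// Illustrative sketch of the pd -> ph lane layout, assuming `avx512fp16` and
// `avx512vl` are enabled. Hypothetical locals.
//
//     let a = _mm_set_pd(2.0, 1.0);  // lanes: [1.0, 2.0]
//     let r = _mm_cvtpd_ph(a);       // lanes 0-1 = 1.0, 2.0 as f16; lanes 2..=7 zeroed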
13128
13129/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13130/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
13131///
13132/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
13133#[inline]
13134#[target_feature(enable = "avx512fp16,avx512vl")]
13135#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13136#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13137pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
13138    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13139}
13140
13141/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13142/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13143/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13144///
13145/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
13146#[inline]
13147#[target_feature(enable = "avx512fp16,avx512vl")]
13148#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13149#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13150pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
13151    unsafe { vcvtpd2ph_256(a, src, k) }
13152}
13153
13154/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13155/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13156/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13157///
13158/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
13159#[inline]
13160#[target_feature(enable = "avx512fp16,avx512vl")]
13161#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13163pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
13164    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13165}
13166
13167/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13168/// floating-point elements, and store the results in dst.
13169///
13170/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
13171#[inline]
13172#[target_feature(enable = "avx512fp16")]
13173#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13174#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13175pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
13176    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a)
13177}
13178
13179/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13180/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13181/// when the corresponding mask bit is not set).
13182///
13183/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
13184#[inline]
13185#[target_feature(enable = "avx512fp16")]
13186#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13187#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13188pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
13189    unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
13190}
13191
13192/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13193/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13194/// corresponding mask bit is not set).
13195///
13196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
13197#[inline]
13198#[target_feature(enable = "avx512fp16")]
13199#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13200#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13201pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
13202    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a)
13203}
13204
13205/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13206/// floating-point elements, and store the results in dst.
13207///
13208/// Rounding is done according to the rounding parameter, which can be one of:
13209///
13210/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13211/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13212/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13213/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13214/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13215///
13216/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
13217#[inline]
13218#[target_feature(enable = "avx512fp16")]
13219#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13220#[rustc_legacy_const_generics(1)]
13221#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13222pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
13223    static_assert_rounding!(ROUNDING);
13224    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a)
13225}
13226
13227/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13228/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13229/// when the corresponding mask bit is not set).
13230///
13231/// Rounding is done according to the rounding parameter, which can be one of:
13232///
13233/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13234/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13235/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13236/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13237/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13238///
13239/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
13240#[inline]
13241#[target_feature(enable = "avx512fp16")]
13242#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13243#[rustc_legacy_const_generics(3)]
13244#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13245pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
13246    src: __m128h,
13247    k: __mmask8,
13248    a: __m512d,
13249) -> __m128h {
13250    unsafe {
13251        static_assert_rounding!(ROUNDING);
13252        vcvtpd2ph_512(a, src, k, ROUNDING)
13253    }
13254}
13255
13256/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13257/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13258/// corresponding mask bit is not set).
13259///
13260/// Rounding is done according to the rounding parameter, which can be one of:
13261///
13262/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13263/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13264/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13265/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13266/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13267///
13268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
13269#[inline]
13270#[target_feature(enable = "avx512fp16")]
13271#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13272#[rustc_legacy_const_generics(2)]
13273#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13274pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
13275    static_assert_rounding!(ROUNDING);
13276    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
13277}
13278
13279/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13280/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13281/// elements from a to the upper elements of dst.
13282///
13283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
13284#[inline]
13285#[target_feature(enable = "avx512fp16")]
13286#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13287#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13288pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
13289    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
13290}
13291
13292/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13293/// floating-point element, store the result in the lower element of dst using writemask k (the element
13294/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13295/// upper elements of dst.
13296///
13297/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
13298#[inline]
13299#[target_feature(enable = "avx512fp16")]
13300#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13301#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13302pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13303    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
13304}
13305
13306/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13307/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13308/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13309/// elements of dst.
13310///
13311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
13312#[inline]
13313#[target_feature(enable = "avx512fp16")]
13314#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13315#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13316pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13317    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b)
13318}
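// Illustrative sketch of the scalar sd -> sh lane layout, assuming
// `avx512fp16` is enabled. Hypothetical locals.
//
//     let a = _mm_set1_ph(4.0);   // supplies the upper 7 f16 lanes
//     let b = _mm_set_sd(0.25);   // lower f64 lane to be narrowed
//     let r = _mm_cvtsd_sh(a, b); // lane 0 = 0.25, lanes 1..=7 = 4.0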
13319
13320/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13321/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13322/// elements from a to the upper elements of dst.
13323///
13324/// Rounding is done according to the rounding parameter, which can be one of:
13325///
13326/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13327/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13328/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13329/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13330/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13331///
13332/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
13333#[inline]
13334#[target_feature(enable = "avx512fp16")]
13335#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13336#[rustc_legacy_const_generics(2)]
13337#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13338pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
13339    static_assert_rounding!(ROUNDING);
13340    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
13341}
13342
13343/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13344/// floating-point element, store the result in the lower element of dst using writemask k (the element
13345/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13346/// upper elements of dst.
13347///
13348/// Rounding is done according to the rounding parameter, which can be one of:
13349///
13350/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13351/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13352/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13353/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13354/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13355///
13356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
13357#[inline]
13358#[target_feature(enable = "avx512fp16")]
13359#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13360#[rustc_legacy_const_generics(4)]
13361#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13362pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
13363    src: __m128h,
13364    k: __mmask8,
13365    a: __m128h,
13366    b: __m128d,
13367) -> __m128h {
13368    unsafe {
13369        static_assert_rounding!(ROUNDING);
13370        vcvtsd2sh(a, b, src, k, ROUNDING)
13371    }
13372}
13373
13374/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13375/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13376/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13377/// elements of dst.
13378///
13379/// Rounding is done according to the rounding parameter, which can be one of:
13380///
13381/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13382/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13383/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13384/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13385/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13386///
13387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
13388#[inline]
13389#[target_feature(enable = "avx512fp16")]
13390#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13391#[rustc_legacy_const_generics(3)]
13392#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13393pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
13394    k: __mmask8,
13395    a: __m128h,
13396    b: __m128d,
13397) -> __m128h {
13398    static_assert_rounding!(ROUNDING);
13399    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13400}
13401
13402/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13403/// store the results in dst.
13404///
13405/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
13406#[inline]
13407#[target_feature(enable = "avx512fp16,avx512vl")]
13408#[cfg_attr(test, assert_instr(vcvtph2w))]
13409#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13410pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
13411    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
13412}
13413
13414/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13415/// store the results in dst using writemask k (elements are copied from src when the corresponding
13416/// mask bit is not set).
13417///
13418/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
13419#[inline]
13420#[target_feature(enable = "avx512fp16,avx512vl")]
13421#[cfg_attr(test, assert_instr(vcvtph2w))]
13422#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13423pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13424    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
13425}
13426
13427/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13428/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13429///
13430/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
13431#[inline]
13432#[target_feature(enable = "avx512fp16,avx512vl")]
13433#[cfg_attr(test, assert_instr(vcvtph2w))]
13434#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13435pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13436    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
13437}
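// Illustrative sketch: converting f16 lanes to signed 16-bit integers with and
// without a zeromask, assuming `avx512fp16` and `avx512vl` are enabled.
// Hypothetical locals.
//
//     let a = _mm_set1_ph(-3.0);
//     let all = _mm_cvtph_epi16(a);                    // eight lanes of -3i16
//     let low = _mm_maskz_cvtph_epi16(0b0000_1111, a); // upper four lanes zeroed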
13438
13439/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13440/// store the results in dst.
13441///
13442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
13443#[inline]
13444#[target_feature(enable = "avx512fp16,avx512vl")]
13445#[cfg_attr(test, assert_instr(vcvtph2w))]
13446#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13447pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
13448    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
13449}
13450
13451/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13452/// store the results in dst using writemask k (elements are copied from src when the corresponding
13453/// mask bit is not set).
13454///
13455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
13456#[inline]
13457#[target_feature(enable = "avx512fp16,avx512vl")]
13458#[cfg_attr(test, assert_instr(vcvtph2w))]
13459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13460pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13461    unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) }
13462}
13463
13464/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13465/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13466///
13467/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
13468#[inline]
13469#[target_feature(enable = "avx512fp16,avx512vl")]
13470#[cfg_attr(test, assert_instr(vcvtph2w))]
13471#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13472pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13473    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
13474}
13475
13476/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13477/// store the results in dst.
13478///
13479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
13480#[inline]
13481#[target_feature(enable = "avx512fp16")]
13482#[cfg_attr(test, assert_instr(vcvtph2w))]
13483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13484pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
13485    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13486}
13487
13488/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13489/// store the results in dst using writemask k (elements are copied from src when the corresponding
13490/// mask bit is not set).
13491///
13492/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
13493#[inline]
13494#[target_feature(enable = "avx512fp16")]
13495#[cfg_attr(test, assert_instr(vcvtph2w))]
13496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13497pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13498    unsafe {
13499        transmute(vcvtph2w_512(
13500            a,
13501            src.as_i16x32(),
13502            k,
13503            _MM_FROUND_CUR_DIRECTION,
13504        ))
13505    }
13506}
13507
13508/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13509/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13510///
13511/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
13512#[inline]
13513#[target_feature(enable = "avx512fp16")]
13514#[cfg_attr(test, assert_instr(vcvtph2w))]
13515#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13516pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13517    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
13518}
13519
13520/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13521/// store the results in dst.
13522///
13523/// Rounding is done according to the rounding parameter, which can be one of:
13524///
13525/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13526/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13527/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13528/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13529/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13530///
13531/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
13532#[inline]
13533#[target_feature(enable = "avx512fp16")]
13534#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13535#[rustc_legacy_const_generics(1)]
13536#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13537pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13538    static_assert_rounding!(ROUNDING);
13539    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13540}
13541
13542/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13543/// store the results in dst using writemask k (elements are copied from src when the corresponding
13544/// mask bit is not set).
13545///
13546/// Rounding is done according to the rounding parameter, which can be one of:
13547///
13548/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13549/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13550/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13551/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13552/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13553///
13554/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
13555#[inline]
13556#[target_feature(enable = "avx512fp16")]
13557#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13558#[rustc_legacy_const_generics(3)]
13559#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13560pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
13561    src: __m512i,
13562    k: __mmask32,
13563    a: __m512h,
13564) -> __m512i {
13565    unsafe {
13566        static_assert_rounding!(ROUNDING);
13567        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
13568    }
13569}
13570
13571/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13572/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13573///
13574/// Rounding is done according to the rounding parameter, which can be one of:
13575///
13576/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13577/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13578/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13579/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13580/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13581///
13582/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
13583#[inline]
13584#[target_feature(enable = "avx512fp16")]
13585#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13586#[rustc_legacy_const_generics(2)]
13587#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13588pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13589    static_assert_rounding!(ROUNDING);
13590    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13591}
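// Illustrative sketch: the rounding parameter decides how ties are resolved
// when narrowing to integers, assuming `avx512fp16` is enabled. Hypothetical locals.
//
//     let a = _mm512_set1_ph(2.5);
//     // Round-to-nearest-even would give 2; rounding toward +inf forces 3 in every lane.
//     let r = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);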
13592
13593/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13594/// and store the results in dst.
13595///
13596/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
13597#[inline]
13598#[target_feature(enable = "avx512fp16,avx512vl")]
13599#[cfg_attr(test, assert_instr(vcvtph2uw))]
13600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13601pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
13602    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
13603}
13604
13605/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13606/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13607/// mask bit is not set).
13608///
13609/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
13610#[inline]
13611#[target_feature(enable = "avx512fp16,avx512vl")]
13612#[cfg_attr(test, assert_instr(vcvtph2uw))]
13613#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13614pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13615    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
13616}
13617
13618/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13619/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13620///
13621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
13622#[inline]
13623#[target_feature(enable = "avx512fp16,avx512vl")]
13624#[cfg_attr(test, assert_instr(vcvtph2uw))]
13625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13626pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13627    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
13628}
13629
13630/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13631/// and store the results in dst.
13632///
13633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
13634#[inline]
13635#[target_feature(enable = "avx512fp16,avx512vl")]
13636#[cfg_attr(test, assert_instr(vcvtph2uw))]
13637#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13638pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
13639    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
13640}
13641
13642/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13643/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13644/// mask bit is not set).
13645///
13646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
13647#[inline]
13648#[target_feature(enable = "avx512fp16,avx512vl")]
13649#[cfg_attr(test, assert_instr(vcvtph2uw))]
13650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13651pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13652    unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) }
13653}
13654
13655/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13656/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13657///
13658/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
13659#[inline]
13660#[target_feature(enable = "avx512fp16,avx512vl")]
13661#[cfg_attr(test, assert_instr(vcvtph2uw))]
13662#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13663pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
13664    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
13665}
13666
13667/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13668/// and store the results in dst.
13669///
13670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
13671#[inline]
13672#[target_feature(enable = "avx512fp16")]
13673#[cfg_attr(test, assert_instr(vcvtph2uw))]
13674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13675pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
13676    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
13677}
13678
13679/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13680/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13681/// mask bit is not set).
13682///
13683/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
13684#[inline]
13685#[target_feature(enable = "avx512fp16")]
13686#[cfg_attr(test, assert_instr(vcvtph2uw))]
13687#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13688pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13689    unsafe {
13690        transmute(vcvtph2uw_512(
13691            a,
13692            src.as_u16x32(),
13693            k,
13694            _MM_FROUND_CUR_DIRECTION,
13695        ))
13696    }
13697}
13698
13699/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13700/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13701///
13702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
13703#[inline]
13704#[target_feature(enable = "avx512fp16")]
13705#[cfg_attr(test, assert_instr(vcvtph2uw))]
13706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13707pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
13708    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
13709}
13710
13711/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13712/// and store the results in dst.
13713///
13714/// Rounding is done according to the rounding parameter, which can be one of:
13715///
13716/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13717/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13718/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13719/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13720/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13721///
13722/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
13723#[inline]
13724#[target_feature(enable = "avx512fp16")]
13725#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13726#[rustc_legacy_const_generics(1)]
13727#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13728pub fn _mm512_cvt_roundph_epu16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13729    static_assert_rounding!(ROUNDING);
13730    _mm512_mask_cvt_roundph_epu16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13731}
13732
13733/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13734/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13735/// mask bit is not set).
13736///
13737/// Rounding is done according to the rounding parameter, which can be one of:
13738///
13739/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13740/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13741/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13742/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13743/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13744///
13745/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
13746#[inline]
13747#[target_feature(enable = "avx512fp16")]
13748#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13749#[rustc_legacy_const_generics(3)]
13750#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13751pub fn _mm512_mask_cvt_roundph_epu16<const ROUNDING: i32>(
13752    src: __m512i,
13753    k: __mmask32,
13754    a: __m512h,
13755) -> __m512i {
13756    unsafe {
13757        static_assert_rounding!(ROUNDING);
13758        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, ROUNDING))
13759    }
13760}
13761
13762/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13763/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13764///
13765/// Rounding is done according to the rounding parameter, which can be one of:
13766///
13767/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13768/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13769/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13770/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13771/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13772///
13773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
13774#[inline]
13775#[target_feature(enable = "avx512fp16")]
13776#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13777#[rustc_legacy_const_generics(2)]
13778#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13779pub fn _mm512_maskz_cvt_roundph_epu16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13780    static_assert_rounding!(ROUNDING);
13781    _mm512_mask_cvt_roundph_epu16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13782}
13783
13784/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13785/// truncation, and store the results in dst.
13786///
13787/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
13788#[inline]
13789#[target_feature(enable = "avx512fp16,avx512vl")]
13790#[cfg_attr(test, assert_instr(vcvttph2w))]
13791#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13792pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
13793    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
13794}
13795
13796/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13797/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13798/// mask bit is not set).
13799///
13800/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
13801#[inline]
13802#[target_feature(enable = "avx512fp16,avx512vl")]
13803#[cfg_attr(test, assert_instr(vcvttph2w))]
13804#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13805pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13806    unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) }
13807}
13808
13809/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13810/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13811/// mask bit is not set).
13812///
13813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
13814#[inline]
13815#[target_feature(enable = "avx512fp16,avx512vl")]
13816#[cfg_attr(test, assert_instr(vcvttph2w))]
13817#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13818pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13819    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
13820}
13821
13822/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13823/// truncation, and store the results in dst.
13824///
13825/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
13826#[inline]
13827#[target_feature(enable = "avx512fp16,avx512vl")]
13828#[cfg_attr(test, assert_instr(vcvttph2w))]
13829#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13830pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
13831    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
13832}
13833
13834/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13835/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13836/// mask bit is not set).
13837///
13838/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
13839#[inline]
13840#[target_feature(enable = "avx512fp16,avx512vl")]
13841#[cfg_attr(test, assert_instr(vcvttph2w))]
13842#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13843pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13844    unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) }
13845}
13846
13847/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13848/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13849/// mask bit is not set).
13850///
13851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
13852#[inline]
13853#[target_feature(enable = "avx512fp16,avx512vl")]
13854#[cfg_attr(test, assert_instr(vcvttph2w))]
13855#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13856pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13857    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
13858}
13859
13860/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13861/// truncation, and store the results in dst.
13862///
13863/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
13864#[inline]
13865#[target_feature(enable = "avx512fp16")]
13866#[cfg_attr(test, assert_instr(vcvttph2w))]
13867#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13868pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
13869    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13870}
13871
13872/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13873/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13874/// mask bit is not set).
13875///
13876/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
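///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let src = _mm512_set1_epi16(-1);
///     let a = _mm512_set1_ph(1.9);
///     // Lanes 0..16 (mask bit set) receive the truncated value 1;
///     // lanes 16..32 (mask bit clear) keep -1 copied from `src`.
///     let r = _mm512_mask_cvttph_epi16(src, 0x0000_ffff, a);
/// }
/// ```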
13877#[inline]
13878#[target_feature(enable = "avx512fp16")]
13879#[cfg_attr(test, assert_instr(vcvttph2w))]
13880#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13881pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13882    unsafe {
13883        transmute(vcvttph2w_512(
13884            a,
13885            src.as_i16x32(),
13886            k,
13887            _MM_FROUND_CUR_DIRECTION,
13888        ))
13889    }
13890}
13891
13892/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13893/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13894/// mask bit is not set).
13895///
13896/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
13897#[inline]
13898#[target_feature(enable = "avx512fp16")]
13899#[cfg_attr(test, assert_instr(vcvttph2w))]
13900#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13901pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13902    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
13903}
13904
13905/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13906/// truncation, and store the results in dst.
13907///
13908/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13909///
13910/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
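///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm512_set1_ph(-2.7);
///     // Truncation always rounds toward zero, so every 16-bit lane of `r` holds -2.
///     let r = _mm512_cvtt_roundph_epi16::<{ _MM_FROUND_NO_EXC }>(a);
/// }
/// ```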
13911#[inline]
13912#[target_feature(enable = "avx512fp16")]
13913#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13914#[rustc_legacy_const_generics(1)]
13915#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13916pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
13917    static_assert_sae!(SAE);
13918    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
13919}
13920
13921/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13922/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13923/// mask bit is not set).
13924///
13925/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13926///
13927/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
13928#[inline]
13929#[target_feature(enable = "avx512fp16")]
13930#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13931#[rustc_legacy_const_generics(3)]
13932#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13933pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
13934    src: __m512i,
13935    k: __mmask32,
13936    a: __m512h,
13937) -> __m512i {
13938    unsafe {
13939        static_assert_sae!(SAE);
13940        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
13941    }
13942}
13943
13944/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13945/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13946/// mask bit is not set).
13947///
13948/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13949///
13950/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
13951#[inline]
13952#[target_feature(enable = "avx512fp16")]
13953#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13954#[rustc_legacy_const_generics(2)]
13955#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13956pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
13957    static_assert_sae!(SAE);
13958    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
13959}
13960
13961/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13962/// truncation, and store the results in dst.
13963///
13964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
13965#[inline]
13966#[target_feature(enable = "avx512fp16,avx512vl")]
13967#[cfg_attr(test, assert_instr(vcvttph2uw))]
13968#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13969pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
13970    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
13971}
13972
13973/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13974/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13975/// mask bit is not set).
13976///
13977/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
13978#[inline]
13979#[target_feature(enable = "avx512fp16,avx512vl")]
13980#[cfg_attr(test, assert_instr(vcvttph2uw))]
13981#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13982pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13983    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
13984}
13985
13986/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13987/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13988/// mask bit is not set).
13989///
13990/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
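///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set1_ph(3.6);
///     // Lanes 0..4 (mask bit set) receive the truncated value 3;
///     // lanes 4..8 (mask bit clear) are zeroed.
///     let r = _mm_maskz_cvttph_epu16(0b0000_1111, a);
/// }
/// ```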
13991#[inline]
13992#[target_feature(enable = "avx512fp16,avx512vl")]
13993#[cfg_attr(test, assert_instr(vcvttph2uw))]
13994#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13995pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13996    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
13997}
13998
13999/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14000/// truncation, and store the results in dst.
14001///
14002/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
14003#[inline]
14004#[target_feature(enable = "avx512fp16,avx512vl")]
14005#[cfg_attr(test, assert_instr(vcvttph2uw))]
14006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14007pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
14008    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
14009}
14010
14011/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14012/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14013/// mask bit is not set).
14014///
14015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
14016#[inline]
14017#[target_feature(enable = "avx512fp16,avx512vl")]
14018#[cfg_attr(test, assert_instr(vcvttph2uw))]
14019#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14020pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
14021    unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) }
14022}
14023
14024/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14025/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14026/// mask bit is not set).
14027///
14028/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
14029#[inline]
14030#[target_feature(enable = "avx512fp16,avx512vl")]
14031#[cfg_attr(test, assert_instr(vcvttph2uw))]
14032#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14033pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
14034    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
14035}
14036
14037/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14038/// truncation, and store the results in dst.
14039///
14040/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
14041#[inline]
14042#[target_feature(enable = "avx512fp16")]
14043#[cfg_attr(test, assert_instr(vcvttph2uw))]
14044#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14045pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
14046    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
14047}
14048
14049/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14050/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14051/// mask bit is not set).
14052///
14053/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
14054#[inline]
14055#[target_feature(enable = "avx512fp16")]
14056#[cfg_attr(test, assert_instr(vcvttph2uw))]
14057#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14058pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
14059    unsafe {
14060        transmute(vcvttph2uw_512(
14061            a,
14062            src.as_u16x32(),
14063            k,
14064            _MM_FROUND_CUR_DIRECTION,
14065        ))
14066    }
14067}
14068
14069/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14070/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14071/// mask bit is not set).
14072///
14073/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
14074#[inline]
14075#[target_feature(enable = "avx512fp16")]
14076#[cfg_attr(test, assert_instr(vcvttph2uw))]
14077#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14078pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14079    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
14080}
14081
14082/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14083/// truncation, and store the results in dst.
14084///
14085/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14086///
14087/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
14088#[inline]
14089#[target_feature(enable = "avx512fp16")]
14090#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14091#[rustc_legacy_const_generics(1)]
14092#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14093pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14094    static_assert_sae!(SAE);
14095    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14096}
14097
14098/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14099/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14100/// mask bit is not set).
14101///
14102/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14103///
14104/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
14105#[inline]
14106#[target_feature(enable = "avx512fp16")]
14107#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14108#[rustc_legacy_const_generics(3)]
14109#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14110pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
14111    src: __m512i,
14112    k: __mmask32,
14113    a: __m512h,
14114) -> __m512i {
14115    unsafe {
14116        static_assert_sae!(SAE);
14117        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
14118    }
14119}
14120
14121/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14122/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14123/// mask bit is not set).
14124///
14125/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14126///
14127/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
14128#[inline]
14129#[target_feature(enable = "avx512fp16")]
14130#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14131#[rustc_legacy_const_generics(2)]
14132#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14133pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14134    static_assert_sae!(SAE);
14135    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14136}
14137
14138/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14139/// results in dst.
14140///
14141/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
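///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     // Only the lower four half-precision elements of `a` are converted,
///     // producing four 32-bit integer lanes in the result.
///     let a = _mm_set1_ph(-7.25);
///     // With the default MXCSR rounding mode (round-to-nearest), each lane holds -7.
///     let r = _mm_cvtph_epi32(a);
/// }
/// ```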
14142#[inline]
14143#[target_feature(enable = "avx512fp16,avx512vl")]
14144#[cfg_attr(test, assert_instr(vcvtph2dq))]
14145#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14146pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
14147    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
14148}
14149
14150/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14151/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14152///
14153/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
14154#[inline]
14155#[target_feature(enable = "avx512fp16,avx512vl")]
14156#[cfg_attr(test, assert_instr(vcvtph2dq))]
14157#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14158pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14159    unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) }
14160}
14161
14162/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14163/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14164///
14165/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
14166#[inline]
14167#[target_feature(enable = "avx512fp16,avx512vl")]
14168#[cfg_attr(test, assert_instr(vcvtph2dq))]
14169#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14170pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14171    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
14172}
14173
14174/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14175/// results in dst.
14176///
14177/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
14178#[inline]
14179#[target_feature(enable = "avx512fp16,avx512vl")]
14180#[cfg_attr(test, assert_instr(vcvtph2dq))]
14181#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14182pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
14183    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
14184}
14185
14186/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14187/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14188///
14189/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
14190#[inline]
14191#[target_feature(enable = "avx512fp16,avx512vl")]
14192#[cfg_attr(test, assert_instr(vcvtph2dq))]
14193#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14194pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14195    unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) }
14196}
14197
14198/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14199/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14200///
14201/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
14202#[inline]
14203#[target_feature(enable = "avx512fp16,avx512vl")]
14204#[cfg_attr(test, assert_instr(vcvtph2dq))]
14205#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14206pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14207    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
14208}
14209
14210/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14211/// results in dst.
14212///
14213/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
14214#[inline]
14215#[target_feature(enable = "avx512fp16")]
14216#[cfg_attr(test, assert_instr(vcvtph2dq))]
14217#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14218pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
14219    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14220}
14221
14222/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14223/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14224///
14225/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
14226#[inline]
14227#[target_feature(enable = "avx512fp16")]
14228#[cfg_attr(test, assert_instr(vcvtph2dq))]
14229#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14230pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14231    unsafe {
14232        transmute(vcvtph2dq_512(
14233            a,
14234            src.as_i32x16(),
14235            k,
14236            _MM_FROUND_CUR_DIRECTION,
14237        ))
14238    }
14239}
14240
14241/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14242/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14243///
14244/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
14245#[inline]
14246#[target_feature(enable = "avx512fp16")]
14247#[cfg_attr(test, assert_instr(vcvtph2dq))]
14248#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14249pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14250    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
14251}
14252
14253/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14254/// results in dst.
14255///
14256/// Rounding is done according to the rounding parameter, which can be one of:
14257///
14258/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14259/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14260/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14261/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14262/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14263///
14264/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
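///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm256_set1_ph(1.5);
///     // Directed rounding changes the result: rounding down yields 1 in every lane,
///     // rounding up yields 2 in every lane.
///     let down = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
///     let up = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
/// }
/// ```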
14265#[inline]
14266#[target_feature(enable = "avx512fp16")]
14267#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14268#[rustc_legacy_const_generics(1)]
14269#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14270pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14271    static_assert_rounding!(ROUNDING);
14272    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14273}
14274
14275/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14276/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14277///
14278/// Rounding is done according to the rounding parameter, which can be one of:
14279///
14280/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14281/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14282/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14283/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14284/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14285///
14286/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
14287#[inline]
14288#[target_feature(enable = "avx512fp16")]
14289#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14290#[rustc_legacy_const_generics(3)]
14291#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14292pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
14293    src: __m512i,
14294    k: __mmask16,
14295    a: __m256h,
14296) -> __m512i {
14297    unsafe {
14298        static_assert_rounding!(ROUNDING);
14299        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
14300    }
14301}
14302
14303/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14304/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14305///
14306/// Rounding is done according to the rounding parameter, which can be one of:
14307///
14308/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14309/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14310/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14311/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14312/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14313///
14314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
14315#[inline]
14316#[target_feature(enable = "avx512fp16")]
14317#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14318#[rustc_legacy_const_generics(2)]
14319#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14320pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14321    static_assert_rounding!(ROUNDING);
14322    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14323}
14324
14325/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14326/// the result in dst.
14327///
14328/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
14329#[inline]
14330#[target_feature(enable = "avx512fp16")]
14331#[cfg_attr(test, assert_instr(vcvtsh2si))]
14332#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14333pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
14334    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14335}
14336
14337/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14338/// the result in dst.
14339///
14340/// Rounding is done according to the rounding parameter, which can be one of:
14341///
14342/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14343/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14344/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14345/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14346/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14347///
14348/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
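///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_sh(2.5);
///     // Round toward negative infinity: 2.5 becomes 2.
///     let i = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
///     assert_eq!(i, 2);
/// }
/// ```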
14349#[inline]
14350#[target_feature(enable = "avx512fp16")]
14351#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
14352#[rustc_legacy_const_generics(1)]
14353#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14354pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
14355    unsafe {
14356        static_assert_rounding!(ROUNDING);
14357        vcvtsh2si32(a, ROUNDING)
14358    }
14359}
14360
14361/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store the
14362/// results in dst.
14363///
14364/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
14365#[inline]
14366#[target_feature(enable = "avx512fp16,avx512vl")]
14367#[cfg_attr(test, assert_instr(vcvtph2udq))]
14368#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14369pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
14370    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
14371}
14372
14373/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14374/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14375///
14376/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
14377#[inline]
14378#[target_feature(enable = "avx512fp16,avx512vl")]
14379#[cfg_attr(test, assert_instr(vcvtph2udq))]
14380#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14381pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14382    unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) }
14383}
14384
14385/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14386/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14387///
14388/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
14389#[inline]
14390#[target_feature(enable = "avx512fp16,avx512vl")]
14391#[cfg_attr(test, assert_instr(vcvtph2udq))]
14392#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14393pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14394    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
14395}
14396
14397/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14398/// the results in dst.
14399///
14400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
14401#[inline]
14402#[target_feature(enable = "avx512fp16,avx512vl")]
14403#[cfg_attr(test, assert_instr(vcvtph2udq))]
14404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14405pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
14406    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
14407}
14408
14409/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14410/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14411///
14412/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
14413#[inline]
14414#[target_feature(enable = "avx512fp16,avx512vl")]
14415#[cfg_attr(test, assert_instr(vcvtph2udq))]
14416#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14417pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14418    unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) }
14419}
14420
14421/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14422/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14423///
14424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
14425#[inline]
14426#[target_feature(enable = "avx512fp16,avx512vl")]
14427#[cfg_attr(test, assert_instr(vcvtph2udq))]
14428#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14429pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14430    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
14431}
14432
14433/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14434/// the results in dst.
14435///
14436/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
14437#[inline]
14438#[target_feature(enable = "avx512fp16")]
14439#[cfg_attr(test, assert_instr(vcvtph2udq))]
14440#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14441pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
14442    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14443}
14444
14445/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14446/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14447///
14448/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
14449#[inline]
14450#[target_feature(enable = "avx512fp16")]
14451#[cfg_attr(test, assert_instr(vcvtph2udq))]
14452#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14453pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14454    unsafe {
14455        transmute(vcvtph2udq_512(
14456            a,
14457            src.as_u32x16(),
14458            k,
14459            _MM_FROUND_CUR_DIRECTION,
14460        ))
14461    }
14462}
14463
14464/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14465/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14466///
14467/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
14468#[inline]
14469#[target_feature(enable = "avx512fp16")]
14470#[cfg_attr(test, assert_instr(vcvtph2udq))]
14471#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14472pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14473    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
14474}
14475
14476/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14477/// the results in dst.
14478///
14479/// Rounding is done according to the rounding parameter, which can be one of:
14480///
14481/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14482/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14483/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14484/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14485/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14486///
14487/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
14488#[inline]
14489#[target_feature(enable = "avx512fp16")]
14490#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14491#[rustc_legacy_const_generics(1)]
14492#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14493pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14494    static_assert_rounding!(ROUNDING);
14495    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14496}
14497
14498/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14499/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14500///
14501/// Rounding is done according to the rounding parameter, which can be one of:
14502///
14503/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14504/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14505/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14506/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14507/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14508///
14509/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
14510#[inline]
14511#[target_feature(enable = "avx512fp16")]
14512#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14513#[rustc_legacy_const_generics(3)]
14514#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14515pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
14516    src: __m512i,
14517    k: __mmask16,
14518    a: __m256h,
14519) -> __m512i {
14520    unsafe {
14521        static_assert_rounding!(ROUNDING);
14522        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
14523    }
14524}
14525
14526/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14527/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14528///
14529/// Rounding is done according to the rounding parameter, which can be one of:
14530///
14531/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14532/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14533/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14534/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14535/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14536///
14537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
14538#[inline]
14539#[target_feature(enable = "avx512fp16")]
14540#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14541#[rustc_legacy_const_generics(2)]
14542#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14543pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14544    static_assert_rounding!(ROUNDING);
14545    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14546}
14547
14548/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14549/// the result in dst.
14550///
14551/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
14552#[inline]
14553#[target_feature(enable = "avx512fp16")]
14554#[cfg_attr(test, assert_instr(vcvtsh2usi))]
14555#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14556pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
14557    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14558}
14559
14560/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14561/// the result in dst.
14562///
14563/// Rounding is done according to the rounding parameter, which can be one of:
14564///
14565/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14566/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14567/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14568/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14569/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14570///
14571/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
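///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_sh(7.5);
///     // Round toward positive infinity: 7.5 becomes the unsigned value 8.
///     let u = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
///     assert_eq!(u, 8);
/// }
/// ```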
14572#[inline]
14573#[target_feature(enable = "avx512fp16")]
14574#[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))]
14575#[rustc_legacy_const_generics(1)]
14576#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14577pub fn _mm_cvt_roundsh_u32<const ROUNDING: i32>(a: __m128h) -> u32 {
14578    unsafe {
14579        static_assert_rounding!(ROUNDING);
14580        vcvtsh2usi32(a, ROUNDING)
14581    }
14582}
14583
14584/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14585/// store the results in dst.
14586///
14587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
14588#[inline]
14589#[target_feature(enable = "avx512fp16,avx512vl")]
14590#[cfg_attr(test, assert_instr(vcvttph2dq))]
14591#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14592pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
14593    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
14594}
14595
14596/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14597/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14598///
14599/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
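///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let src = _mm_set1_epi32(99);
///     let a = _mm_set1_ph(5.8);
///     // Only the lower four half-precision elements of `a` participate.
///     // Lanes 0 and 1 (mask bits set) receive the truncated value 5;
///     // lanes 2 and 3 (mask bits clear) keep 99 copied from `src`.
///     let r = _mm_mask_cvttph_epi32(src, 0b0011, a);
/// }
/// ```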
14600#[inline]
14601#[target_feature(enable = "avx512fp16,avx512vl")]
14602#[cfg_attr(test, assert_instr(vcvttph2dq))]
14603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14604pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14605    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
14606}
14607
14608/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14609/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14610///
14611/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
14612#[inline]
14613#[target_feature(enable = "avx512fp16,avx512vl")]
14614#[cfg_attr(test, assert_instr(vcvttph2dq))]
14615#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14616pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14617    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
14618}
14619
14620/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14621/// store the results in dst.
14622///
14623/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
14624#[inline]
14625#[target_feature(enable = "avx512fp16,avx512vl")]
14626#[cfg_attr(test, assert_instr(vcvttph2dq))]
14627#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14628pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
14629    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
14630}
14631
14632/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14633/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14634///
14635/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
14636#[inline]
14637#[target_feature(enable = "avx512fp16,avx512vl")]
14638#[cfg_attr(test, assert_instr(vcvttph2dq))]
14639#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14640pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14641    unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) }
14642}
14643
14644/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14645/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14646///
14647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
14648#[inline]
14649#[target_feature(enable = "avx512fp16,avx512vl")]
14650#[cfg_attr(test, assert_instr(vcvttph2dq))]
14651#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14652pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14653    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
14654}
14655
14656/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14657/// store the results in dst.
14658///
14659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
14660#[inline]
14661#[target_feature(enable = "avx512fp16")]
14662#[cfg_attr(test, assert_instr(vcvttph2dq))]
14663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14664pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
14665    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14666}
14667
14668/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14669/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14670///
14671/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
14672#[inline]
14673#[target_feature(enable = "avx512fp16")]
14674#[cfg_attr(test, assert_instr(vcvttph2dq))]
14675#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14676pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14677    unsafe {
14678        transmute(vcvttph2dq_512(
14679            a,
14680            src.as_i32x16(),
14681            k,
14682            _MM_FROUND_CUR_DIRECTION,
14683        ))
14684    }
14685}
14686
14687/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14688/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14689///
14690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
14691#[inline]
14692#[target_feature(enable = "avx512fp16")]
14693#[cfg_attr(test, assert_instr(vcvttph2dq))]
14694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14695pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14696    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
14697}
14698
14699/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14700/// store the results in dst.
14701///
14702/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14703///
14704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
14705#[inline]
14706#[target_feature(enable = "avx512fp16")]
14707#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14708#[rustc_legacy_const_generics(1)]
14709#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14710pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
14711    static_assert_sae!(SAE);
14712    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14713}
14714
14715/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14716/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14717///
14718/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14719///
14720/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
14721#[inline]
14722#[target_feature(enable = "avx512fp16")]
14723#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14724#[rustc_legacy_const_generics(3)]
14725#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14726pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
14727    src: __m512i,
14728    k: __mmask16,
14729    a: __m256h,
14730) -> __m512i {
14731    unsafe {
14732        static_assert_sae!(SAE);
14733        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
14734    }
14735}
14736
14737/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14738/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14739///
14740/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14741///
14742/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
14743#[inline]
14744#[target_feature(enable = "avx512fp16")]
14745#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14746#[rustc_legacy_const_generics(2)]
14747#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14748pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14749    static_assert_sae!(SAE);
14750    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
14751}
14752
14753/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14754/// the result in dst.
14755///
14756/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
14757#[inline]
14758#[target_feature(enable = "avx512fp16")]
14759#[cfg_attr(test, assert_instr(vcvttsh2si))]
14760#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14761pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
14762    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14763}
14764
14765/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14766/// the result in dst.
14767///
14768/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14769///
14770/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
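///
/// Illustrative usage (a sketch, not taken from Intel's documentation; assumes the unstable `f16`
/// and `stdarch_x86_avx512_f16` features plus a CPU with AVX512-FP16):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_sh(-3.9);
///     // Truncation always rounds toward zero: -3.9 becomes -3.
///     let i = _mm_cvtt_roundsh_i32::<{ _MM_FROUND_NO_EXC }>(a);
///     assert_eq!(i, -3);
/// }
/// ```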
14771#[inline]
14772#[target_feature(enable = "avx512fp16")]
14773#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
14774#[rustc_legacy_const_generics(1)]
14775#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14776pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
14777    unsafe {
14778        static_assert_sae!(SAE);
14779        vcvttsh2si32(a, SAE)
14780    }
14781}
14782
14783/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14784/// store the results in dst.
14785///
14786/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
14787#[inline]
14788#[target_feature(enable = "avx512fp16,avx512vl")]
14789#[cfg_attr(test, assert_instr(vcvttph2udq))]
14790#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14791pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
14792    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
14793}
14794
14795/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14796/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14797///
14798/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
14799#[inline]
14800#[target_feature(enable = "avx512fp16,avx512vl")]
14801#[cfg_attr(test, assert_instr(vcvttph2udq))]
14802#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14803pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14804    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
14805}
14806
14807/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14808/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14809///
14810/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
14811#[inline]
14812#[target_feature(enable = "avx512fp16,avx512vl")]
14813#[cfg_attr(test, assert_instr(vcvttph2udq))]
14814#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14815pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14816    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
14817}
14818
14819/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14820/// store the results in dst.
14821///
14822/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
14823#[inline]
14824#[target_feature(enable = "avx512fp16,avx512vl")]
14825#[cfg_attr(test, assert_instr(vcvttph2udq))]
14826#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14827pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
14828    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
14829}
14830
14831/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14832/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14833///
14834/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
14835#[inline]
14836#[target_feature(enable = "avx512fp16,avx512vl")]
14837#[cfg_attr(test, assert_instr(vcvttph2udq))]
14838#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14839pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14840    unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) }
14841}
14842
14843/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14844/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14845///
14846/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
14847#[inline]
14848#[target_feature(enable = "avx512fp16,avx512vl")]
14849#[cfg_attr(test, assert_instr(vcvttph2udq))]
14850#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14851pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14852    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
14853}
14854
14855/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14856/// store the results in dst.
14857///
14858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
14859#[inline]
14860#[target_feature(enable = "avx512fp16")]
14861#[cfg_attr(test, assert_instr(vcvttph2udq))]
14862#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14863pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
14864    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14865}
14866
14867/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14868/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14869///
14870/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
14871#[inline]
14872#[target_feature(enable = "avx512fp16")]
14873#[cfg_attr(test, assert_instr(vcvttph2udq))]
14874#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14875pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14876    unsafe {
14877        transmute(vcvttph2udq_512(
14878            a,
14879            src.as_u32x16(),
14880            k,
14881            _MM_FROUND_CUR_DIRECTION,
14882        ))
14883    }
14884}
14885
14886/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14887/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14888///
14889/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
14890#[inline]
14891#[target_feature(enable = "avx512fp16")]
14892#[cfg_attr(test, assert_instr(vcvttph2udq))]
14893#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14894pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14895    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
14896}
14897
14898/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14899/// store the results in dst.
14900///
14901/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14902///
14903/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
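///
/// # Examples
///
/// An illustrative sketch of the `SAE` const parameter, not from Intel's
/// documentation; it assumes nightly `stdarch_x86_avx512_f16`/`f16` and an
/// `avx512fp16` CPU, and the `demo` wrapper is only a placeholder.
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm256_set1_ph(42.7);
///     // Truncate all 16 lanes to 42 while suppressing floating-point exceptions.
///     let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
/// }
/// ```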
14904#[inline]
14905#[target_feature(enable = "avx512fp16")]
14906#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14907#[rustc_legacy_const_generics(1)]
14908#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14909pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
14910    static_assert_sae!(SAE);
14911    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14912}
14913
14914/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14915/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14916///
14917/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14918///
14919/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
14920#[inline]
14921#[target_feature(enable = "avx512fp16")]
14922#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14923#[rustc_legacy_const_generics(3)]
14924#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14925pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
14926    src: __m512i,
14927    k: __mmask16,
14928    a: __m256h,
14929) -> __m512i {
14930    unsafe {
14931        static_assert_sae!(SAE);
14932        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
14933    }
14934}
14935
14936/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14937/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14938///
14939/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14940///
14941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
14942#[inline]
14943#[target_feature(enable = "avx512fp16")]
14944#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14945#[rustc_legacy_const_generics(2)]
14946#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14947pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14948    static_assert_sae!(SAE);
14949    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
14950}
14951
14952/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
14953/// the result in dst.
14954///
14955/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
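///
/// # Examples
///
/// An illustrative sketch, not from Intel's documentation; it assumes nightly
/// `stdarch_x86_avx512_f16`/`f16` and an `avx512fp16` CPU, and the `demo` wrapper
/// is only a placeholder.
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm_set_sh(3.9);
///     // Truncation rounds toward zero, so the result is 3.
///     let r: u32 = _mm_cvttsh_u32(a);
/// }
/// ```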
14956#[inline]
14957#[target_feature(enable = "avx512fp16")]
14958#[cfg_attr(test, assert_instr(vcvttsh2usi))]
14959#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14960pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
14961    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14962}
14963
14964/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
14965/// the result in dst.
14966///
14967/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14968///
14969/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
14970#[inline]
14971#[target_feature(enable = "avx512fp16")]
14972#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
14973#[rustc_legacy_const_generics(1)]
14974#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14975pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
14976    unsafe {
14977        static_assert_sae!(SAE);
14978        vcvttsh2usi32(a, SAE)
14979    }
14980}
14981
14982/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14983/// store the results in dst.
14984///
14985/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
14986#[inline]
14987#[target_feature(enable = "avx512fp16,avx512vl")]
14988#[cfg_attr(test, assert_instr(vcvtph2qq))]
14989#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14990pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
14991    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
14992}
14993
14994/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14995/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14996///
14997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
14998#[inline]
14999#[target_feature(enable = "avx512fp16,avx512vl")]
15000#[cfg_attr(test, assert_instr(vcvtph2qq))]
15001#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15002pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15003    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
15004}
15005
15006/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15007/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15008///
15009/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
15010#[inline]
15011#[target_feature(enable = "avx512fp16,avx512vl")]
15012#[cfg_attr(test, assert_instr(vcvtph2qq))]
15013#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15014pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15015    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
15016}
15017
15018/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15019/// store the results in dst.
15020///
15021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
15022#[inline]
15023#[target_feature(enable = "avx512fp16,avx512vl")]
15024#[cfg_attr(test, assert_instr(vcvtph2qq))]
15025#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15026pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
15027    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
15028}
15029
15030/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15031/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15032///
15033/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
15034#[inline]
15035#[target_feature(enable = "avx512fp16,avx512vl")]
15036#[cfg_attr(test, assert_instr(vcvtph2qq))]
15037#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15038pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15039    unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) }
15040}
15041
15042/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15043/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15044///
15045/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
15046#[inline]
15047#[target_feature(enable = "avx512fp16,avx512vl")]
15048#[cfg_attr(test, assert_instr(vcvtph2qq))]
15049#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15050pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15051    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
15052}
15053
15054/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15055/// store the results in dst.
15056///
15057/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
15058#[inline]
15059#[target_feature(enable = "avx512fp16")]
15060#[cfg_attr(test, assert_instr(vcvtph2qq))]
15061#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15062pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
15063    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
15064}
15065
15066/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15067/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15068///
15069/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
15070#[inline]
15071#[target_feature(enable = "avx512fp16")]
15072#[cfg_attr(test, assert_instr(vcvtph2qq))]
15073#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15074pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15075    unsafe {
15076        transmute(vcvtph2qq_512(
15077            a,
15078            src.as_i64x8(),
15079            k,
15080            _MM_FROUND_CUR_DIRECTION,
15081        ))
15082    }
15083}
15084
15085/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15086/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15087///
15088/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
15089#[inline]
15090#[target_feature(enable = "avx512fp16")]
15091#[cfg_attr(test, assert_instr(vcvtph2qq))]
15092#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15093pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15094    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
15095}
15096
15097/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15098/// store the results in dst.
15099///
15100/// Rounding is done according to the rounding parameter, which can be one of:
15101///
15102/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15103/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15104/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15105/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15106/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15107///
15108/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
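///
/// # Examples
///
/// An illustrative sketch of the `ROUNDING` const parameter, not from Intel's
/// documentation; it assumes nightly `stdarch_x86_avx512_f16`/`f16` and an
/// `avx512fp16` CPU, and the `demo` wrapper is only a placeholder.
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm_set1_ph(2.5);
///     // Round-to-nearest-even with exceptions suppressed: 2.5 becomes 2 in each
///     // of the eight i64 lanes.
///     let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
/// }
/// ```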
15109#[inline]
15110#[target_feature(enable = "avx512fp16")]
15111#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15112#[rustc_legacy_const_generics(1)]
15113#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15114pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15115    static_assert_rounding!(ROUNDING);
15116    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15117}
15118
15119/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15120/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15121///
15122/// Rounding is done according to the rounding parameter, which can be one of:
15123///
15124/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15125/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15126/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15127/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15128/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15129///
15130/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
15131#[inline]
15132#[target_feature(enable = "avx512fp16")]
15133#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15134#[rustc_legacy_const_generics(3)]
15135#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15136pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
15137    src: __m512i,
15138    k: __mmask8,
15139    a: __m128h,
15140) -> __m512i {
15141    unsafe {
15142        static_assert_rounding!(ROUNDING);
15143        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
15144    }
15145}
15146
15147/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15148/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15149///
15150/// Rounding is done according to the rounding parameter, which can be one of:
15151///
15152/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15153/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15154/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15155/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15156/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15157///
15158/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
15159#[inline]
15160#[target_feature(enable = "avx512fp16")]
15161#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15162#[rustc_legacy_const_generics(2)]
15163#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15164pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15165    static_assert_rounding!(ROUNDING);
15166    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15167}
15168
15169/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15170/// store the results in dst.
15171///
15172/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
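///
/// # Examples
///
/// An illustrative sketch, not from Intel's documentation; it assumes nightly
/// `stdarch_x86_avx512_f16`/`f16`, an `avx512fp16`+`avx512vl` CPU and the default
/// MXCSR rounding mode, and the `demo` wrapper is only a placeholder.
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let a = _mm_set1_ph(1.5);
///     // Only the two lowest f16 lanes feed the two u64 lanes; under the default
///     // round-to-nearest-even mode, 1.5 converts to 2.
///     let r: __m128i = _mm_cvtph_epu64(a);
/// }
/// ```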
15173#[inline]
15174#[target_feature(enable = "avx512fp16,avx512vl")]
15175#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15176#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15177pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
15178    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
15179}
15180
15181/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15182/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15183///
15184/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
15185#[inline]
15186#[target_feature(enable = "avx512fp16,avx512vl")]
15187#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15188#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15189pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15190    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
15191}
15192
15193/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15194/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15195///
15196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
15197#[inline]
15198#[target_feature(enable = "avx512fp16,avx512vl")]
15199#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15200#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15201pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15202    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
15203}
15204
15205/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15206/// store the results in dst.
15207///
15208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
15209#[inline]
15210#[target_feature(enable = "avx512fp16,avx512vl")]
15211#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15212#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15213pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
15214    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
15215}
15216
15217/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15218/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15219///
15220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
15221#[inline]
15222#[target_feature(enable = "avx512fp16,avx512vl")]
15223#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15224#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15225pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15226    unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) }
15227}
15228
15229/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15230/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15231///
15232/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
15233#[inline]
15234#[target_feature(enable = "avx512fp16,avx512vl")]
15235#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15236#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15237pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15238    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
15239}
15240
15241/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15242/// store the results in dst.
15243///
15244/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
15245#[inline]
15246#[target_feature(enable = "avx512fp16")]
15247#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15248#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15249pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
15250    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
15251}
15252
15253/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15254/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15255///
15256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
15257#[inline]
15258#[target_feature(enable = "avx512fp16")]
15259#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15261pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15262    unsafe {
15263        transmute(vcvtph2uqq_512(
15264            a,
15265            src.as_u64x8(),
15266            k,
15267            _MM_FROUND_CUR_DIRECTION,
15268        ))
15269    }
15270}
15271
15272/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15273/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15274///
15275/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
15276#[inline]
15277#[target_feature(enable = "avx512fp16")]
15278#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15279#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15280pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15281    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
15282}
15283
15284/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15285/// store the results in dst.
15286///
15287/// Rounding is done according to the rounding parameter, which can be one of:
15288///
15289/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15290/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15291/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15292/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15293/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15294///
15295/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
15296#[inline]
15297#[target_feature(enable = "avx512fp16")]
15298#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15299#[rustc_legacy_const_generics(1)]
15300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15301pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15302    static_assert_rounding!(ROUNDING);
15303    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15304}
15305
15306/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15307/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15308///
15309/// Rounding is done according to the rounding parameter, which can be one of:
15310///
15311/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15312/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15313/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15314/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15315/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15316///
15317/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
15318#[inline]
15319#[target_feature(enable = "avx512fp16")]
15320#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15321#[rustc_legacy_const_generics(3)]
15322#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15323pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
15324    src: __m512i,
15325    k: __mmask8,
15326    a: __m128h,
15327) -> __m512i {
15328    unsafe {
15329        static_assert_rounding!(ROUNDING);
15330        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
15331    }
15332}
15333
15334/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15335/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15336///
15337/// Rounding is done according to the rounding parameter, which can be one of:
15338///
15339/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15340/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15341/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15342/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15343/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15344///
15345/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
15346#[inline]
15347#[target_feature(enable = "avx512fp16")]
15348#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15349#[rustc_legacy_const_generics(2)]
15350#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15351pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15352    static_assert_rounding!(ROUNDING);
15353    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15354}
15355
15356/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15357/// store the results in dst.
15358///
15359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
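///
/// # Examples
///
/// An illustrative sketch, not from Intel's documentation; it assumes nightly
/// `stdarch_x86_avx512_f16`/`f16` and an `avx512fp16`+`avx512vl` CPU, and the
/// `demo` wrapper is only a placeholder.
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     // Elements 0 and 1 (the last two arguments) are the only lanes converted.
///     let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9, -2.9);
///     // Truncation rounds toward zero, so the two i64 lanes are -2 and 2.
///     let r: __m128i = _mm_cvttph_epi64(a);
/// }
/// ```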
15360#[inline]
15361#[target_feature(enable = "avx512fp16,avx512vl")]
15362#[cfg_attr(test, assert_instr(vcvttph2qq))]
15363#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15364pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
15365    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
15366}
15367
15368/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15369/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15370///
15371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
15372#[inline]
15373#[target_feature(enable = "avx512fp16,avx512vl")]
15374#[cfg_attr(test, assert_instr(vcvttph2qq))]
15375#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15376pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15377    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
15378}
15379
15380/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15381/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15382///
15383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
15384#[inline]
15385#[target_feature(enable = "avx512fp16,avx512vl")]
15386#[cfg_attr(test, assert_instr(vcvttph2qq))]
15387#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15388pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15389    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
15390}
15391
15392/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15393/// store the results in dst.
15394///
15395/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
15396#[inline]
15397#[target_feature(enable = "avx512fp16,avx512vl")]
15398#[cfg_attr(test, assert_instr(vcvttph2qq))]
15399#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15400pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
15401    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
15402}
15403
15404/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15405/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15406///
15407/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
15408#[inline]
15409#[target_feature(enable = "avx512fp16,avx512vl")]
15410#[cfg_attr(test, assert_instr(vcvttph2qq))]
15411#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15412pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15413    unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) }
15414}
15415
15416/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15417/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15418///
15419/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
15420#[inline]
15421#[target_feature(enable = "avx512fp16,avx512vl")]
15422#[cfg_attr(test, assert_instr(vcvttph2qq))]
15423#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15424pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15425    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
15426}
15427
15428/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15429/// store the results in dst.
15430///
15431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
15432#[inline]
15433#[target_feature(enable = "avx512fp16")]
15434#[cfg_attr(test, assert_instr(vcvttph2qq))]
15435#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15436pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
15437    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
15438}
15439
15440/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15441/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15442///
15443/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
15444#[inline]
15445#[target_feature(enable = "avx512fp16")]
15446#[cfg_attr(test, assert_instr(vcvttph2qq))]
15447#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15448pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15449    unsafe {
15450        transmute(vcvttph2qq_512(
15451            a,
15452            src.as_i64x8(),
15453            k,
15454            _MM_FROUND_CUR_DIRECTION,
15455        ))
15456    }
15457}
15458
15459/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15460/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15461///
15462/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
15463#[inline]
15464#[target_feature(enable = "avx512fp16")]
15465#[cfg_attr(test, assert_instr(vcvttph2qq))]
15466#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15467pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15468    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
15469}
15470
15471/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15472/// store the results in dst.
15473///
15474/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15475///
15476/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
15477#[inline]
15478#[target_feature(enable = "avx512fp16")]
15479#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15480#[rustc_legacy_const_generics(1)]
15481#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15482pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
15483    static_assert_sae!(SAE);
15484    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15485}
15486
15487/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15488/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15489///
15490/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15491///
15492/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
15493#[inline]
15494#[target_feature(enable = "avx512fp16")]
15495#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15496#[rustc_legacy_const_generics(3)]
15497#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15498pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
15499    src: __m512i,
15500    k: __mmask8,
15501    a: __m128h,
15502) -> __m512i {
15503    unsafe {
15504        static_assert_sae!(SAE);
15505        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
15506    }
15507}
15508
15509/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15510/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15511///
15512/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15513///
15514/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
15515#[inline]
15516#[target_feature(enable = "avx512fp16")]
15517#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15518#[rustc_legacy_const_generics(2)]
15519#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15520pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15521    static_assert_sae!(SAE);
15522    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
15523}
15524
15525/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15526/// store the results in dst.
15527///
15528/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
15529#[inline]
15530#[target_feature(enable = "avx512fp16,avx512vl")]
15531#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15532#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15533pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
15534    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
15535}
15536
15537/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15538/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15539///
15540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
15541#[inline]
15542#[target_feature(enable = "avx512fp16,avx512vl")]
15543#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15544#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15545pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15546    unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) }
15547}
15548
15549/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15550/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15551///
15552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
15553#[inline]
15554#[target_feature(enable = "avx512fp16,avx512vl")]
15555#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15556#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15557pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15558    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
15559}
15560
15561/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15562/// store the results in dst.
15563///
15564/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
15565#[inline]
15566#[target_feature(enable = "avx512fp16,avx512vl")]
15567#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15568#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15569pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
15570    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
15571}
15572
15573/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15574/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15575///
15576/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
15577#[inline]
15578#[target_feature(enable = "avx512fp16,avx512vl")]
15579#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15580#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15581pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15582    unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) }
15583}
15584
15585/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15586/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15587///
15588/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
15589#[inline]
15590#[target_feature(enable = "avx512fp16,avx512vl")]
15591#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15592#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15593pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15594    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
15595}
15596
15597/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15598/// store the results in dst.
15599///
15600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
15601#[inline]
15602#[target_feature(enable = "avx512fp16")]
15603#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15605pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
15606    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
15607}
15608
15609/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15610/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15611///
15612/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
15613#[inline]
15614#[target_feature(enable = "avx512fp16")]
15615#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15616#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15617pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15618    unsafe {
15619        transmute(vcvttph2uqq_512(
15620            a,
15621            src.as_u64x8(),
15622            k,
15623            _MM_FROUND_CUR_DIRECTION,
15624        ))
15625    }
15626}
15627
15628/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15629/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15630///
15631/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
15632#[inline]
15633#[target_feature(enable = "avx512fp16")]
15634#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15635#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15636pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15637    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
15638}
15639
15640/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15641/// store the results in dst.
15642///
15643/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15644///
15645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
15646#[inline]
15647#[target_feature(enable = "avx512fp16")]
15648#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15649#[rustc_legacy_const_generics(1)]
15650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15651pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
15652    static_assert_sae!(SAE);
15653    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15654}
15655
15656/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15657/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15658///
15659/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15660///
15661/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
15662#[inline]
15663#[target_feature(enable = "avx512fp16")]
15664#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15665#[rustc_legacy_const_generics(3)]
15666#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15667pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
15668    src: __m512i,
15669    k: __mmask8,
15670    a: __m128h,
15671) -> __m512i {
15672    unsafe {
15673        static_assert_sae!(SAE);
15674        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
15675    }
15676}
15677
15678/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15679/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15680///
15681/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15682///
15683/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
15684#[inline]
15685#[target_feature(enable = "avx512fp16")]
15686#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15687#[rustc_legacy_const_generics(2)]
15688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15689pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15690    static_assert_sae!(SAE);
15691    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
15692}
15693
15694/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15695/// floating-point elements, and store the results in dst.
15696///
15697/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
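///
/// # Examples
///
/// An illustrative sketch, not from Intel's documentation; it assumes nightly
/// `stdarch_x86_avx512_f16`/`f16` and an `avx512fp16`+`avx512vl` CPU, and the
/// `demo` wrapper is only a placeholder.
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn demo() {
///     let a = _mm_set1_ph(1.5);
///     // The four lowest f16 lanes are widened to f32; 1.5 is exact in both
///     // formats, so every output lane is exactly 1.5f32.
///     let r: __m128 = _mm_cvtxph_ps(a);
/// }
/// ```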
15698#[inline]
15699#[target_feature(enable = "avx512fp16,avx512vl")]
15700#[cfg_attr(test, assert_instr(vcvtph2psx))]
15701#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15702pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
15703    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
15704}
15705
15706/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15707/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15708/// dst when the corresponding mask bit is not set).
15709///
15710/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
15711#[inline]
15712#[target_feature(enable = "avx512fp16,avx512vl")]
15713#[cfg_attr(test, assert_instr(vcvtph2psx))]
15714#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15715pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
15716    unsafe { vcvtph2psx_128(a, src, k) }
15717}
15718
15719/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15720/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15721/// corresponding mask bit is not set).
15722///
15723/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
15724#[inline]
15725#[target_feature(enable = "avx512fp16,avx512vl")]
15726#[cfg_attr(test, assert_instr(vcvtph2psx))]
15727#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15728pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
15729    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
15730}
15731
15732/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15733/// floating-point elements, and store the results in dst.
15734///
15735/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
15736#[inline]
15737#[target_feature(enable = "avx512fp16,avx512vl")]
15738#[cfg_attr(test, assert_instr(vcvtph2psx))]
15739#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15740pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
15741    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
15742}
15743
15744/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15745/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15746/// dst when the corresponding mask bit is not set).
15747///
15748/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
15749#[inline]
15750#[target_feature(enable = "avx512fp16,avx512vl")]
15751#[cfg_attr(test, assert_instr(vcvtph2psx))]
15752#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15753pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
15754    unsafe { vcvtph2psx_256(a, src, k) }
15755}
15756
15757/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15758/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15759/// corresponding mask bit is not set).
15760///
15761/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
15762#[inline]
15763#[target_feature(enable = "avx512fp16,avx512vl")]
15764#[cfg_attr(test, assert_instr(vcvtph2psx))]
15765#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15766pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
15767    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
15768}
15769
15770/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15771/// floating-point elements, and store the results in dst.
15772///
15773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
15774#[inline]
15775#[target_feature(enable = "avx512fp16")]
15776#[cfg_attr(test, assert_instr(vcvtph2psx))]
15777#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15778pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
15779    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
15780}
15781
15782/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15783/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15784/// dst when the corresponding mask bit is not set).
15785///
15786/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
15787#[inline]
15788#[target_feature(enable = "avx512fp16")]
15789#[cfg_attr(test, assert_instr(vcvtph2psx))]
15790#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15791pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
15792    unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
15793}
15794
15795/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15796/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15797/// corresponding mask bit is not set).
15798///
15799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
15800#[inline]
15801#[target_feature(enable = "avx512fp16")]
15802#[cfg_attr(test, assert_instr(vcvtph2psx))]
15803#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15804pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
15805    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
15806}
15807
15808/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15809/// floating-point elements, and store the results in dst.
15810///
15811/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15812///
15813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
15814#[inline]
15815#[target_feature(enable = "avx512fp16")]
15816#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15817#[rustc_legacy_const_generics(1)]
15818#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15819pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
15820    static_assert_sae!(SAE);
15821    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
15822}
15823
15824/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15825/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15826/// dst when the corresponding mask bit is not set).
15827///
15828/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15829///
15830/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
15831#[inline]
15832#[target_feature(enable = "avx512fp16")]
15833#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15834#[rustc_legacy_const_generics(3)]
15835#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15836pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
15837    src: __m512,
15838    k: __mmask16,
15839    a: __m256h,
15840) -> __m512 {
15841    unsafe {
15842        static_assert_sae!(SAE);
15843        vcvtph2psx_512(a, src, k, SAE)
15844    }
15845}
15846
15847/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15848/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15849/// corresponding mask bit is not set).
15850///
15851/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15852///
15853/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
15854#[inline]
15855#[target_feature(enable = "avx512fp16")]
15856#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15857#[rustc_legacy_const_generics(2)]
15858#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15859pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
15860    static_assert_sae!(SAE);
15861    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
15862}
15863
15864/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15865/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
15866/// elements from a to the upper elements of dst.
15867///
15868/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
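///
/// # Examples
///
/// An illustrative sketch, not from Intel's documentation; it assumes nightly
/// `stdarch_x86_avx512_f16`/`f16` and an `avx512fp16` CPU, and the `demo` wrapper
/// is only a placeholder.
///
/// ```ignore
/// #[target_feature(enable = "avx512fp16")]
/// fn demo() {
///     let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
///     let b = _mm_set_sh(0.5);
///     // Lane 0 of the result is 0.5f32 (converted from `b`); lanes 1..=3 are
///     // copied from `a`, i.e. 2.0, 3.0 and 4.0.
///     let r = _mm_cvtsh_ss(a, b);
/// }
/// ```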
15869#[inline]
15870#[target_feature(enable = "avx512fp16")]
15871#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15872#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15873pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
15874    _mm_mask_cvtsh_ss(a, 0xff, a, b)
15875}
15876
15877/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15878/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15879/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15880/// upper elements of dst.
15881///
15882/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
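///
/// A minimal sketch of the writemask behaviour (marked `ignore`, not compiled as a doctest):
///
/// ```ignore
/// let src = _mm_set1_ps(9.0);
/// let a = _mm_set1_ps(2.0);
/// let b = _mm_set_sh(1.0);
/// // Mask bit 0 set: the lower lane becomes 1.0, the upper lanes come from `a`.
/// let converted = _mm_mask_cvtsh_ss(src, 0b1, a, b);
/// // Mask bit 0 clear: the lower lane is copied from `src` instead.
/// let kept = _mm_mask_cvtsh_ss(src, 0b0, a, b);
/// ```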
15883#[inline]
15884#[target_feature(enable = "avx512fp16")]
15885#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15886#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15887pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15888    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
15889}
15890
15891/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15892/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15893/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15894/// of dst.
15895///
15896/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
15897#[inline]
15898#[target_feature(enable = "avx512fp16")]
15899#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15900#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15901pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15902    _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b)
15903}
15904
15905/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15906/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
15907/// from a to the upper elements of dst.
15908///
15909/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15910///
15911/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
15912#[inline]
15913#[target_feature(enable = "avx512fp16")]
15914#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15915#[rustc_legacy_const_generics(2)]
15916#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15917pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
15918    static_assert_sae!(SAE);
15919    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
15920}
15921
15922/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15923/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15924/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15925/// upper elements of dst.
15926///
15927/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15928///
15929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
15930#[inline]
15931#[target_feature(enable = "avx512fp16")]
15932#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15933#[rustc_legacy_const_generics(4)]
15934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15935pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
15936    src: __m128,
15937    k: __mmask8,
15938    a: __m128,
15939    b: __m128h,
15940) -> __m128 {
15941    unsafe {
15942        static_assert_sae!(SAE);
15943        vcvtsh2ss(a, b, src, k, SAE)
15944    }
15945}
15946
15947/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15948/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15949/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15950/// of dst.
15951///
15952/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15953///
15954/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
15955#[inline]
15956#[target_feature(enable = "avx512fp16")]
15957#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15958#[rustc_legacy_const_generics(3)]
15959#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15960pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15961    static_assert_sae!(SAE);
15962    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_set_ss(0.0), k, a, b)
15963}
15964
15965/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15966/// floating-point elements, and store the results in dst.
15967///
15968/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
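///
/// A minimal sketch (marked `ignore`, not compiled as a doctest), assuming `avx512fp16` and
/// `avx512vl` are available; only the two lowest half-precision lanes of `a` participate:
///
/// ```ignore
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0);
/// let r = _mm_cvtph_pd(a);
/// // r holds [1.0, 2.0], i.e. the same value as _mm_set_pd(2.0, 1.0).
/// ```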
15969#[inline]
15970#[target_feature(enable = "avx512fp16,avx512vl")]
15971#[cfg_attr(test, assert_instr(vcvtph2pd))]
15972#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15973pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
15974    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
15975}
15976
15977/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15978/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15979/// dst when the corresponding mask bit is not set).
15980///
15981/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
15982#[inline]
15983#[target_feature(enable = "avx512fp16,avx512vl")]
15984#[cfg_attr(test, assert_instr(vcvtph2pd))]
15985#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15986pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
15987    unsafe { vcvtph2pd_128(a, src, k) }
15988}
15989
15990/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15991/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15992/// corresponding mask bit is not set).
15993///
15994/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
15995#[inline]
15996#[target_feature(enable = "avx512fp16,avx512vl")]
15997#[cfg_attr(test, assert_instr(vcvtph2pd))]
15998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15999pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
16000    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
16001}
16002
16003/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16004/// floating-point elements, and store the results in dst.
16005///
16006/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
16007#[inline]
16008#[target_feature(enable = "avx512fp16,avx512vl")]
16009#[cfg_attr(test, assert_instr(vcvtph2pd))]
16010#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16011pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
16012    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
16013}
16014
16015/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16016/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16017/// dst when the corresponding mask bit is not set).
16018///
16019/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
16020#[inline]
16021#[target_feature(enable = "avx512fp16,avx512vl")]
16022#[cfg_attr(test, assert_instr(vcvtph2pd))]
16023#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16024pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
16025    unsafe { vcvtph2pd_256(a, src, k) }
16026}
16027
16028/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16029/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16030/// corresponding mask bit is not set).
16031///
16032/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
16033#[inline]
16034#[target_feature(enable = "avx512fp16,avx512vl")]
16035#[cfg_attr(test, assert_instr(vcvtph2pd))]
16036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16037pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
16038    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
16039}
16040
16041/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16042/// floating-point elements, and store the results in dst.
16043///
16044/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
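///
/// A minimal sketch (marked `ignore`, not compiled as a doctest); all eight half-precision
/// lanes of `a` are widened to f64:
///
/// ```ignore
/// let a = _mm_set1_ph(0.5);
/// let r = _mm512_cvtph_pd(a);
/// // Every f64 lane of `r` is 0.5.
/// ```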
16045#[inline]
16046#[target_feature(enable = "avx512fp16")]
16047#[cfg_attr(test, assert_instr(vcvtph2pd))]
16048#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16049pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
16050    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
16051}
16052
16053/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16054/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16055/// dst when the corresponding mask bit is not set).
16056///
16057/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
16058#[inline]
16059#[target_feature(enable = "avx512fp16")]
16060#[cfg_attr(test, assert_instr(vcvtph2pd))]
16061#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16062pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
16063    unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
16064}
16065
16066/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16067/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16068/// corresponding mask bit is not set).
16069///
16070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
16071#[inline]
16072#[target_feature(enable = "avx512fp16")]
16073#[cfg_attr(test, assert_instr(vcvtph2pd))]
16074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16075pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
16076    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
16077}
16078
16079/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16080/// floating-point elements, and store the results in dst.
16081///
16082/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16083///
16084/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
16085#[inline]
16086#[target_feature(enable = "avx512fp16")]
16087#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16088#[rustc_legacy_const_generics(1)]
16089#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16090pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
16091    static_assert_sae!(SAE);
16092    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
16093}
16094
16095/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16096/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16097/// dst when the corresponding mask bit is not set).
16098///
16099/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16100///
16101/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
16102#[inline]
16103#[target_feature(enable = "avx512fp16")]
16104#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16105#[rustc_legacy_const_generics(3)]
16106#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16107pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
16108    src: __m512d,
16109    k: __mmask8,
16110    a: __m128h,
16111) -> __m512d {
16112    unsafe {
16113        static_assert_sae!(SAE);
16114        vcvtph2pd_512(a, src, k, SAE)
16115    }
16116}
16117
16118/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16119/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16120/// corresponding mask bit is not set).
16121///
16122/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16123///
16124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
16125#[inline]
16126#[target_feature(enable = "avx512fp16")]
16127#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16128#[rustc_legacy_const_generics(2)]
16129#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16130pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
16131    static_assert_sae!(SAE);
16132    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
16133}
16134
16135/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16136/// floating-point element, store the result in the lower element of dst, and copy the upper element
16137/// from a to the upper element of dst.
16138///
16139/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
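///
/// A minimal sketch (marked `ignore`, not compiled as a doctest):
///
/// ```ignore
/// let a = _mm_set_pd(7.0, 7.0);
/// let b = _mm_set_sh(3.0);
/// let r = _mm_cvtsh_sd(a, b);
/// // r holds [3.0, 7.0]: the lower lane comes from `b`, the upper lane from `a`.
/// ```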
16140#[inline]
16141#[target_feature(enable = "avx512fp16")]
16142#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16143#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16144pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
16145    _mm_mask_cvtsh_sd(a, 0xff, a, b)
16146}
16147
16148/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16149/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16150/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16151/// of dst.
16152///
16153/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
16154#[inline]
16155#[target_feature(enable = "avx512fp16")]
16156#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16157#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16158pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16159    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
16160}
16161
16162/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16163/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16164/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16165///
16166/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
16167#[inline]
16168#[target_feature(enable = "avx512fp16")]
16169#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16170#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16171pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16172    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
16173}
16174
16175/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16176/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
16177/// to the upper element of dst.
16178///
16179/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16180///
16181/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
16182#[inline]
16183#[target_feature(enable = "avx512fp16")]
16184#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16185#[rustc_legacy_const_generics(2)]
16186#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16187pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
16188    static_assert_sae!(SAE);
16189    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
16190}
16191
16192/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16193/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16194/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16195/// of dst.
16196///
16197/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16198///
16199/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
16200#[inline]
16201#[target_feature(enable = "avx512fp16")]
16202#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16203#[rustc_legacy_const_generics(4)]
16204#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16205pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
16206    src: __m128d,
16207    k: __mmask8,
16208    a: __m128d,
16209    b: __m128h,
16210) -> __m128d {
16211    unsafe {
16212        static_assert_sae!(SAE);
16213        vcvtsh2sd(a, b, src, k, SAE)
16214    }
16215}
16216
16217/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16218/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16219/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16220///
16221/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16222///
16223/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
16224#[inline]
16225#[target_feature(enable = "avx512fp16")]
16226#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16227#[rustc_legacy_const_generics(3)]
16228#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16229pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16230    static_assert_sae!(SAE);
16231    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
16232}
16233
16234/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16235///
16236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
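///
/// A minimal sketch (marked `ignore`, not compiled as a doctest):
///
/// ```ignore
/// let a = _mm_set_sh(2.5);
/// let x: f16 = _mm_cvtsh_h(a);
/// // x == 2.5
/// ```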
16237#[inline]
16238#[target_feature(enable = "avx512fp16")]
16239#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16240pub fn _mm_cvtsh_h(a: __m128h) -> f16 {
16241    unsafe { simd_extract!(a, 0) }
16242}
16243
16244/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16245///
16246/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
16247#[inline]
16248#[target_feature(enable = "avx512fp16")]
16249#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16250pub fn _mm256_cvtsh_h(a: __m256h) -> f16 {
16251    unsafe { simd_extract!(a, 0) }
16252}
16253
16254/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16255///
16256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
16257#[inline]
16258#[target_feature(enable = "avx512fp16")]
16259#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16260pub fn _mm512_cvtsh_h(a: __m512h) -> f16 {
16261    unsafe { simd_extract!(a, 0) }
16262}
16263
16264/// Copy the lower 16-bit integer in a to dst.
16265///
16266/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
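///
/// A minimal sketch (marked `ignore`, not compiled as a doctest):
///
/// ```ignore
/// let v = _mm_set1_epi16(42);
/// let x = _mm_cvtsi128_si16(v);
/// // x == 42i16
/// ```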
16267#[inline]
16268#[target_feature(enable = "avx512fp16")]
16269#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16270pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
16271    unsafe { simd_extract!(a.as_i16x8(), 0) }
16272}
16273
16274/// Copy 16-bit integer a to the lower element of dst, and zero the upper elements of dst.
16275///
16276/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
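///
/// A minimal sketch (marked `ignore`, not compiled as a doctest); the two copies round-trip:
///
/// ```ignore
/// let v = _mm_cvtsi16_si128(-7);
/// // Lane 0 of `v` holds -7; the remaining seven 16-bit lanes are zero.
/// assert_eq!(_mm_cvtsi128_si16(v), -7);
/// ```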
16277#[inline]
16278#[target_feature(enable = "avx512fp16")]
16279#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16280pub fn _mm_cvtsi16_si128(a: i16) -> __m128i {
16281    unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) }
16282}
16283
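// Declarations of the LLVM intrinsics that back the wrappers above. The link names are
// LLVM-internal and not a stable interface; they are listed here only so the wrappers in
// this module can call them.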
16284#[allow(improper_ctypes)]
16285unsafe extern "C" {
16286    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
16287    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
16288    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
16289    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;
16290
16291    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
16292    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16293    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
16294    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16295    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
16296    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16297    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
16298    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16299
16300    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
16301    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16302    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
16303    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16304    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
16305    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16306    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
16307    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16308
16309    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
16310    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16311    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
16312    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16313    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
16314    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16315    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
16316    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16317
16318    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
16319    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16320    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
16321    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16322    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
16323    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16324    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
16325    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16326
16327    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
16328    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16329    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
16330    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16331    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
16332    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16333    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
16334    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16335    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
16336    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16337    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
16338    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16339    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
16340    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16341    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
16342    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16343
16344    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
16345    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16346    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
16347    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16348    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
16349    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16350    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
16351    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16352    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
16353    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16354    -> __m512;
16355    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
16356    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16357    -> __m512;
16358    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
16359    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16360    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
16361    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16362
16363    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
16364    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16365    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
16366    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
16367
16368    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"]
16369    fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h;
16370    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"]
16371    fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h;
16372    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
16373    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16374
16375    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
16376    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16377    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
16378    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16379    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
16380    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16381    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
16382    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16383
16384    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
16385    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16386    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
16387    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16388    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
16389    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16390    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
16391    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16392
16393    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
16394    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
16395    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
16396    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16397
16398    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
16399    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
16400    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
16401    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
16402    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
16403    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16404    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
16405    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16406
16407    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
16408    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
16409    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
16410    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
16411    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
16412    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16413    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
16414    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16415
16416    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
16417    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16418    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
16419    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16420    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
16421    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16422    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
16423    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16424
16425    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
16426    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16427    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
16428    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16429    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
16430    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16431    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
16432    fn vgetmantsh(
16433        a: __m128h,
16434        b: __m128h,
16435        imm8: i32,
16436        src: __m128h,
16437        k: __mmask8,
16438        sae: i32,
16439    ) -> __m128h;
16440
16441    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
16442    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16443    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
16444    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16445    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
16446    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16447    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
16448    fn vrndscalesh(
16449        a: __m128h,
16450        b: __m128h,
16451        src: __m128h,
16452        k: __mmask8,
16453        imm8: i32,
16454        sae: i32,
16455    ) -> __m128h;
16456
16457    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
16458    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16459    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
16460    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16461    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
16462    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
16463    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
16464    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16465
16466    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
16467    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16468    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
16469    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16470    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
16471    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16472    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
16473    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
16474    -> __m128h;
16475
16476    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
16477    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;
16478
16479    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
16480    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
16481    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
16482    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
16483    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
16484    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
16485    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u16"]
16486    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
16487    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16u16"]
16488    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
16489    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32u16"]
16490    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;
16491
16492    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
16493    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
16494    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
16495    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
16496    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
16497    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
16498    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
16499    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
16500    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
16501    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
16502    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u32"]
16503    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
16504    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16u32"]
16505    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
16506    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
16507    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;
16508
16509    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
16510    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
16511    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
16512    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
16513    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
16514    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
16515    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
16516    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
16517    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
16518    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
16519    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u64"]
16520    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;
16521
16522    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
16523    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
16524    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
16525    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
16526    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
16527    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
16528    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
16529    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16530
16531    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
16532    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
16533    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
16534    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
16535    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
16536    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16537    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
16538    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16539
16540    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
16541    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16542    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
16543    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16544    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
16545    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
16546    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
16547    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16548    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
16549    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16550    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
16551    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, rounding: i32) -> u16x32;
16552
16553    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
16554    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16555    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
16556    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16557    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
16558    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
16559    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
16560    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16561    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
16562    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16563    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
16564    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16565
16566    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
16567    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16568    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
16569    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16570    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
16571    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
16572    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
16573    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
16574    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
16575    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16576    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
16577    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16578    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
16579    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
16580    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
16581    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;
16582
16583    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
16584    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16585    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
16586    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16587    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
16588    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
16589    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
16590    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
16591    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
16592    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16593    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
16594    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16595    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
16596    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
16597    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
16598    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;
16599
16600    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
16601    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16602    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
16603    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16604    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
16605    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
16606    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
16607    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16608    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
16609    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16610    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
16611    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;
16612
16613    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
16614    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16615    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
16616    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16617    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
16618    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
16619    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
16620    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16621    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
16622    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16623    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
16624    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;
16625
16626    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
16627    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
16628    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
16629    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
16630    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
16631    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
16632    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
16633    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;
16634
16635    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
16636    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
16637    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
16638    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
16639    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
16640    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
16641    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
16642    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;
16643
16644}
16645
16646#[cfg(test)]
16647mod tests {
16648    use crate::core_arch::x86::*;
16649    use crate::mem::transmute;
16650    use crate::ptr::{addr_of, addr_of_mut};
16651    use stdarch_test::simd_test;
16652
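    // Test-local helpers that build vectors of interleaved (re, im) half-precision pairs,
    // used by the complex-number (pch/sch) tests.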
16653    #[target_feature(enable = "avx512fp16")]
16654    unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
16655        _mm_setr_ph(re, im, re, im, re, im, re, im)
16656    }
16657
16658    #[target_feature(enable = "avx512fp16")]
16659    unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
16660        _mm256_setr_ph(
16661            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16662        )
16663    }
16664
16665    #[target_feature(enable = "avx512fp16")]
16666    unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
16667        _mm512_setr_ph(
16668            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16669            re, im, re, im, re, im, re, im, re, im,
16670        )
16671    }
16672
16673    #[simd_test(enable = "avx512fp16,avx512vl")]
16674    unsafe fn test_mm_set_ph() {
16675        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16676        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16677        assert_eq_m128h(r, e);
16678    }
16679
16680    #[simd_test(enable = "avx512fp16,avx512vl")]
16681    unsafe fn test_mm256_set_ph() {
16682        let r = _mm256_set_ph(
16683            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16684        );
16685        let e = _mm256_setr_ph(
16686            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16687        );
16688        assert_eq_m256h(r, e);
16689    }
16690
16691    #[simd_test(enable = "avx512fp16")]
16692    unsafe fn test_mm512_set_ph() {
16693        let r = _mm512_set_ph(
16694            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16695            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16696            31.0, 32.0,
16697        );
16698        let e = _mm512_setr_ph(
16699            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16700            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16701            3.0, 2.0, 1.0,
16702        );
16703        assert_eq_m512h(r, e);
16704    }
16705
16706    #[simd_test(enable = "avx512fp16,avx512vl")]
16707    unsafe fn test_mm_set_sh() {
16708        let r = _mm_set_sh(1.0);
16709        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
16710        assert_eq_m128h(r, e);
16711    }
16712
16713    #[simd_test(enable = "avx512fp16,avx512vl")]
16714    unsafe fn test_mm_set1_ph() {
16715        let r = _mm_set1_ph(1.0);
16716        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
16717        assert_eq_m128h(r, e);
16718    }
16719
16720    #[simd_test(enable = "avx512fp16,avx512vl")]
16721    unsafe fn test_mm256_set1_ph() {
16722        let r = _mm256_set1_ph(1.0);
16723        let e = _mm256_set_ph(
16724            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16725        );
16726        assert_eq_m256h(r, e);
16727    }
16728
16729    #[simd_test(enable = "avx512fp16")]
16730    unsafe fn test_mm512_set1_ph() {
16731        let r = _mm512_set1_ph(1.0);
16732        let e = _mm512_set_ph(
16733            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16734            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16735        );
16736        assert_eq_m512h(r, e);
16737    }
16738
16739    #[simd_test(enable = "avx512fp16,avx512vl")]
16740    unsafe fn test_mm_setr_ph() {
16741        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16742        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16743        assert_eq_m128h(r, e);
16744    }
16745
16746    #[simd_test(enable = "avx512fp16,avx512vl")]
16747    unsafe fn test_mm256_setr_ph() {
16748        let r = _mm256_setr_ph(
16749            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16750        );
16751        let e = _mm256_set_ph(
16752            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16753        );
16754        assert_eq_m256h(r, e);
16755    }
16756
16757    #[simd_test(enable = "avx512fp16")]
16758    unsafe fn test_mm512_setr_ph() {
16759        let r = _mm512_setr_ph(
16760            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16761            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16762            31.0, 32.0,
16763        );
16764        let e = _mm512_set_ph(
16765            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16766            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16767            3.0, 2.0, 1.0,
16768        );
16769        assert_eq_m512h(r, e);
16770    }
16771
16772    #[simd_test(enable = "avx512fp16,avx512vl")]
16773    unsafe fn test_mm_setzero_ph() {
16774        let r = _mm_setzero_ph();
16775        let e = _mm_set1_ph(0.0);
16776        assert_eq_m128h(r, e);
16777    }
16778
16779    #[simd_test(enable = "avx512fp16,avx512vl")]
16780    unsafe fn test_mm256_setzero_ph() {
16781        let r = _mm256_setzero_ph();
16782        let e = _mm256_set1_ph(0.0);
16783        assert_eq_m256h(r, e);
16784    }
16785
16786    #[simd_test(enable = "avx512fp16")]
16787    unsafe fn test_mm512_setzero_ph() {
16788        let r = _mm512_setzero_ph();
16789        let e = _mm512_set1_ph(0.0);
16790        assert_eq_m512h(r, e);
16791    }
16792
16793    #[simd_test(enable = "avx512fp16,avx512vl")]
16794    unsafe fn test_mm_castsi128_ph() {
16795        let a = _mm_set1_epi16(0x3c00);
16796        let r = _mm_castsi128_ph(a);
16797        let e = _mm_set1_ph(1.0);
16798        assert_eq_m128h(r, e);
16799    }
16800
16801    #[simd_test(enable = "avx512fp16,avx512vl")]
16802    unsafe fn test_mm256_castsi256_ph() {
16803        let a = _mm256_set1_epi16(0x3c00);
16804        let r = _mm256_castsi256_ph(a);
16805        let e = _mm256_set1_ph(1.0);
16806        assert_eq_m256h(r, e);
16807    }
16808
16809    #[simd_test(enable = "avx512fp16")]
16810    unsafe fn test_mm512_castsi512_ph() {
16811        let a = _mm512_set1_epi16(0x3c00);
16812        let r = _mm512_castsi512_ph(a);
16813        let e = _mm512_set1_ph(1.0);
16814        assert_eq_m512h(r, e);
16815    }
16816
16817    #[simd_test(enable = "avx512fp16")]
16818    unsafe fn test_mm_castph_si128() {
16819        let a = _mm_set1_ph(1.0);
16820        let r = _mm_castph_si128(a);
16821        let e = _mm_set1_epi16(0x3c00);
16822        assert_eq_m128i(r, e);
16823    }
16824
16825    #[simd_test(enable = "avx512fp16")]
16826    unsafe fn test_mm256_castph_si256() {
16827        let a = _mm256_set1_ph(1.0);
16828        let r = _mm256_castph_si256(a);
16829        let e = _mm256_set1_epi16(0x3c00);
16830        assert_eq_m256i(r, e);
16831    }
16832
16833    #[simd_test(enable = "avx512fp16")]
16834    unsafe fn test_mm512_castph_si512() {
16835        let a = _mm512_set1_ph(1.0);
16836        let r = _mm512_castph_si512(a);
16837        let e = _mm512_set1_epi16(0x3c00);
16838        assert_eq_m512i(r, e);
16839    }
16840
16841    #[simd_test(enable = "avx512fp16,avx512vl")]
16842    unsafe fn test_mm_castps_ph() {
16843        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
16844        let r = _mm_castps_ph(a);
16845        let e = _mm_set1_ph(1.0);
16846        assert_eq_m128h(r, e);
16847    }
16848
16849    #[simd_test(enable = "avx512fp16,avx512vl")]
16850    unsafe fn test_mm256_castps_ph() {
16851        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
16852        let r = _mm256_castps_ph(a);
16853        let e = _mm256_set1_ph(1.0);
16854        assert_eq_m256h(r, e);
16855    }
16856
16857    #[simd_test(enable = "avx512fp16")]
16858    unsafe fn test_mm512_castps_ph() {
16859        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
16860        let r = _mm512_castps_ph(a);
16861        let e = _mm512_set1_ph(1.0);
16862        assert_eq_m512h(r, e);
16863    }
16864
16865    #[simd_test(enable = "avx512fp16")]
16866    unsafe fn test_mm_castph_ps() {
16867        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
16868        let r = _mm_castph_ps(a);
16869        let e = _mm_set1_ps(1.0);
16870        assert_eq_m128(r, e);
16871    }
16872
16873    #[simd_test(enable = "avx512fp16")]
16874    unsafe fn test_mm256_castph_ps() {
16875        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
16876        let r = _mm256_castph_ps(a);
16877        let e = _mm256_set1_ps(1.0);
16878        assert_eq_m256(r, e);
16879    }
16880
16881    #[simd_test(enable = "avx512fp16")]
16882    unsafe fn test_mm512_castph_ps() {
16883        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
16884        let r = _mm512_castph_ps(a);
16885        let e = _mm512_set1_ps(1.0);
16886        assert_eq_m512(r, e);
16887    }
16888
16889    #[simd_test(enable = "avx512fp16,avx512vl")]
16890    unsafe fn test_mm_castpd_ph() {
16891        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
16892        let r = _mm_castpd_ph(a);
16893        let e = _mm_set1_ph(1.0);
16894        assert_eq_m128h(r, e);
16895    }
16896
16897    #[simd_test(enable = "avx512fp16,avx512vl")]
16898    unsafe fn test_mm256_castpd_ph() {
16899        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
16900        let r = _mm256_castpd_ph(a);
16901        let e = _mm256_set1_ph(1.0);
16902        assert_eq_m256h(r, e);
16903    }
16904
16905    #[simd_test(enable = "avx512fp16")]
16906    unsafe fn test_mm512_castpd_ph() {
16907        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
16908        let r = _mm512_castpd_ph(a);
16909        let e = _mm512_set1_ph(1.0);
16910        assert_eq_m512h(r, e);
16911    }
16912
16913    #[simd_test(enable = "avx512fp16")]
16914    unsafe fn test_mm_castph_pd() {
16915        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
16916        let r = _mm_castph_pd(a);
16917        let e = _mm_set1_pd(1.0);
16918        assert_eq_m128d(r, e);
16919    }
16920
16921    #[simd_test(enable = "avx512fp16")]
16922    unsafe fn test_mm256_castph_pd() {
16923        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
16924        let r = _mm256_castph_pd(a);
16925        let e = _mm256_set1_pd(1.0);
16926        assert_eq_m256d(r, e);
16927    }
16928
16929    #[simd_test(enable = "avx512fp16")]
16930    unsafe fn test_mm512_castph_pd() {
16931        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
16932        let r = _mm512_castph_pd(a);
16933        let e = _mm512_set1_pd(1.0);
16934        assert_eq_m512d(r, e);
16935    }
16936
16937    #[simd_test(enable = "avx512fp16,avx512vl")]
16938    unsafe fn test_mm256_castph256_ph128() {
16939        let a = _mm256_setr_ph(
16940            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16941        );
16942        let r = _mm256_castph256_ph128(a);
16943        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16944        assert_eq_m128h(r, e);
16945    }
16946
16947    #[simd_test(enable = "avx512fp16,avx512vl")]
16948    unsafe fn test_mm512_castph512_ph128() {
16949        let a = _mm512_setr_ph(
16950            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16951            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16952        );
16953        let r = _mm512_castph512_ph128(a);
16954        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16955        assert_eq_m128h(r, e);
16956    }
16957
16958    #[simd_test(enable = "avx512fp16,avx512vl")]
16959    unsafe fn test_mm512_castph512_ph256() {
16960        let a = _mm512_setr_ph(
16961            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16962            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16963        );
16964        let r = _mm512_castph512_ph256(a);
16965        let e = _mm256_setr_ph(
16966            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16967        );
16968        assert_eq_m256h(r, e);
16969    }
16970
16971    #[simd_test(enable = "avx512fp16,avx512vl")]
16972    unsafe fn test_mm256_castph128_ph256() {
16973        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16974        let r = _mm256_castph128_ph256(a);
16975        assert_eq_m128h(_mm256_castph256_ph128(r), a);
16976    }
16977
16978    #[simd_test(enable = "avx512fp16,avx512vl")]
16979    unsafe fn test_mm512_castph128_ph512() {
16980        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16981        let r = _mm512_castph128_ph512(a);
16982        assert_eq_m128h(_mm512_castph512_ph128(r), a);
16983    }
16984
16985    #[simd_test(enable = "avx512fp16,avx512vl")]
16986    unsafe fn test_mm512_castph256_ph512() {
16987        let a = _mm256_setr_ph(
16988            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16989        );
16990        let r = _mm512_castph256_ph512(a);
16991        assert_eq_m256h(_mm512_castph512_ph256(r), a);
16992    }
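
    // The `castph128_ph256`, `castph128_ph512` and `castph256_ph512` intrinsics leave the
    // upper lanes of the wider vector undefined, which is why the three tests above only
    // compare the low half that survives the round trip. The `zext` variants tested below
    // additionally guarantee that the upper lanes are zeroed.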
16993
16994    #[simd_test(enable = "avx512fp16,avx512vl")]
16995    unsafe fn test_mm256_zextph128_ph256() {
16996        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16997        let r = _mm256_zextph128_ph256(a);
16998        let e = _mm256_setr_ph(
16999            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
17000        );
17001        assert_eq_m256h(r, e);
17002    }
17003
17004    #[simd_test(enable = "avx512fp16")]
17005    unsafe fn test_mm512_zextph128_ph512() {
17006        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17007        let r = _mm512_zextph128_ph512(a);
17008        let e = _mm512_setr_ph(
17009            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17010            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17011        );
17012        assert_eq_m512h(r, e);
17013    }
17014
17015    #[simd_test(enable = "avx512fp16")]
17016    unsafe fn test_mm512_zextph256_ph512() {
17017        let a = _mm256_setr_ph(
17018            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
17019        );
17020        let r = _mm512_zextph256_ph512(a);
17021        let e = _mm512_setr_ph(
17022            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
17023            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17024        );
17025        assert_eq_m512h(r, e);
17026    }
17027
17028    #[simd_test(enable = "avx512fp16,avx512vl")]
17029    unsafe fn test_mm_cmp_ph_mask() {
17030        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17031        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17032        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17033        assert_eq!(r, 0b11110000);
17034    }
17035
17036    #[simd_test(enable = "avx512fp16,avx512vl")]
17037    unsafe fn test_mm_mask_cmp_ph_mask() {
17038        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17039        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17040        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
17041        assert_eq!(r, 0b01010000);
17042    }
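
    // Sketch (not part of the original tests): the expected masks can be derived lane by
    // lane. Bit i of the result corresponds to element i, and `_mm_set_ph` lists its
    // arguments from e7 down to e0, so the equal leading arguments land in the high bits of
    // the mask; the masked variant then ANDs the comparison result with the supplied write
    // mask. Using f32 values here purely for illustration:
    #[test]
    fn scalar_cmp_mask_sketch() {
        // element order e0..e7, i.e. the reverse of the `_mm_set_ph` argument order above
        let a = [8.0f32, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0];
        let b = [-8.0f32, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
        let mut full = 0u8;
        for i in 0..8 {
            if a[i] == b[i] {
                full |= 1 << i;
            }
        }
        assert_eq!(full, 0b11110000);
        assert_eq!(full & 0b01010101, 0b01010000);
    }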
17043
17044    #[simd_test(enable = "avx512fp16,avx512vl")]
17045    unsafe fn test_mm256_cmp_ph_mask() {
17046        let a = _mm256_set_ph(
17047            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17048        );
17049        let b = _mm256_set_ph(
17050            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17051            -16.0,
17052        );
17053        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17054        assert_eq!(r, 0b1111000011110000);
17055    }
17056
17057    #[simd_test(enable = "avx512fp16,avx512vl")]
17058    unsafe fn test_mm256_mask_cmp_ph_mask() {
17059        let a = _mm256_set_ph(
17060            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17061        );
17062        let b = _mm256_set_ph(
17063            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17064            -16.0,
17065        );
17066        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
17067        assert_eq!(r, 0b0101000001010000);
17068    }
17069
17070    #[simd_test(enable = "avx512fp16")]
17071    unsafe fn test_mm512_cmp_ph_mask() {
17072        let a = _mm512_set_ph(
17073            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17074            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17075            31.0, 32.0,
17076        );
17077        let b = _mm512_set_ph(
17078            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17079            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17080            -29.0, -30.0, -31.0, -32.0,
17081        );
17082        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17083        assert_eq!(r, 0b11110000111100001111000011110000);
17084    }
17085
17086    #[simd_test(enable = "avx512fp16")]
17087    unsafe fn test_mm512_mask_cmp_ph_mask() {
17088        let a = _mm512_set_ph(
17089            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17090            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17091            31.0, 32.0,
17092        );
17093        let b = _mm512_set_ph(
17094            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17095            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17096            -29.0, -30.0, -31.0, -32.0,
17097        );
17098        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
17099        assert_eq!(r, 0b01010000010100000101000001010000);
17100    }
17101
17102    #[simd_test(enable = "avx512fp16")]
17103    unsafe fn test_mm512_cmp_round_ph_mask() {
17104        let a = _mm512_set_ph(
17105            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17106            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17107            31.0, 32.0,
17108        );
17109        let b = _mm512_set_ph(
17110            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17111            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17112            -29.0, -30.0, -31.0, -32.0,
17113        );
17114        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17115        assert_eq!(r, 0b11110000111100001111000011110000);
17116    }
17117
17118    #[simd_test(enable = "avx512fp16")]
17119    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
17120        let a = _mm512_set_ph(
17121            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17122            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17123            31.0, 32.0,
17124        );
17125        let b = _mm512_set_ph(
17126            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17127            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17128            -29.0, -30.0, -31.0, -32.0,
17129        );
17130        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
17131            0b01010101010101010101010101010101,
17132            a,
17133            b,
17134        );
17135        assert_eq!(r, 0b01010000010100000101000001010000);
17136    }
17137
17138    #[simd_test(enable = "avx512fp16")]
17139    unsafe fn test_mm_cmp_round_sh_mask() {
17140        let a = _mm_set_sh(1.0);
17141        let b = _mm_set_sh(1.0);
17142        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17143        assert_eq!(r, 1);
17144    }
17145
17146    #[simd_test(enable = "avx512fp16")]
17147    unsafe fn test_mm_mask_cmp_round_sh_mask() {
17148        let a = _mm_set_sh(1.0);
17149        let b = _mm_set_sh(1.0);
17150        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
17151        assert_eq!(r, 0);
17152    }
17153
17154    #[simd_test(enable = "avx512fp16")]
17155    unsafe fn test_mm_cmp_sh_mask() {
17156        let a = _mm_set_sh(1.0);
17157        let b = _mm_set_sh(1.0);
17158        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
17159        assert_eq!(r, 1);
17160    }
17161
17162    #[simd_test(enable = "avx512fp16")]
17163    unsafe fn test_mm_mask_cmp_sh_mask() {
17164        let a = _mm_set_sh(1.0);
17165        let b = _mm_set_sh(1.0);
17166        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
17167        assert_eq!(r, 0);
17168    }
17169
17170    #[simd_test(enable = "avx512fp16")]
17171    unsafe fn test_mm_comi_round_sh() {
17172        let a = _mm_set_sh(1.0);
17173        let b = _mm_set_sh(1.0);
17174        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17175        assert_eq!(r, 1);
17176    }
17177
17178    #[simd_test(enable = "avx512fp16")]
17179    unsafe fn test_mm_comi_sh() {
17180        let a = _mm_set_sh(1.0);
17181        let b = _mm_set_sh(1.0);
17182        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
17183        assert_eq!(r, 1);
17184    }
17185
17186    #[simd_test(enable = "avx512fp16")]
17187    unsafe fn test_mm_comieq_sh() {
17188        let a = _mm_set_sh(1.0);
17189        let b = _mm_set_sh(1.0);
17190        let r = _mm_comieq_sh(a, b);
17191        assert_eq!(r, 1);
17192    }
17193
17194    #[simd_test(enable = "avx512fp16")]
17195    unsafe fn test_mm_comige_sh() {
17196        let a = _mm_set_sh(2.0);
17197        let b = _mm_set_sh(1.0);
17198        let r = _mm_comige_sh(a, b);
17199        assert_eq!(r, 1);
17200    }
17201
17202    #[simd_test(enable = "avx512fp16")]
17203    unsafe fn test_mm_comigt_sh() {
17204        let a = _mm_set_sh(2.0);
17205        let b = _mm_set_sh(1.0);
17206        let r = _mm_comigt_sh(a, b);
17207        assert_eq!(r, 1);
17208    }
17209
17210    #[simd_test(enable = "avx512fp16")]
17211    unsafe fn test_mm_comile_sh() {
17212        let a = _mm_set_sh(1.0);
17213        let b = _mm_set_sh(2.0);
17214        let r = _mm_comile_sh(a, b);
17215        assert_eq!(r, 1);
17216    }
17217
17218    #[simd_test(enable = "avx512fp16")]
17219    unsafe fn test_mm_comilt_sh() {
17220        let a = _mm_set_sh(1.0);
17221        let b = _mm_set_sh(2.0);
17222        let r = _mm_comilt_sh(a, b);
17223        assert_eq!(r, 1);
17224    }
17225
17226    #[simd_test(enable = "avx512fp16")]
17227    unsafe fn test_mm_comineq_sh() {
17228        let a = _mm_set_sh(1.0);
17229        let b = _mm_set_sh(2.0);
17230        let r = _mm_comineq_sh(a, b);
17231        assert_eq!(r, 1);
17232    }
17233
17234    #[simd_test(enable = "avx512fp16")]
17235    unsafe fn test_mm_ucomieq_sh() {
17236        let a = _mm_set_sh(1.0);
17237        let b = _mm_set_sh(1.0);
17238        let r = _mm_ucomieq_sh(a, b);
17239        assert_eq!(r, 1);
17240    }
17241
17242    #[simd_test(enable = "avx512fp16")]
17243    unsafe fn test_mm_ucomige_sh() {
17244        let a = _mm_set_sh(2.0);
17245        let b = _mm_set_sh(1.0);
17246        let r = _mm_ucomige_sh(a, b);
17247        assert_eq!(r, 1);
17248    }
17249
17250    #[simd_test(enable = "avx512fp16")]
17251    unsafe fn test_mm_ucomigt_sh() {
17252        let a = _mm_set_sh(2.0);
17253        let b = _mm_set_sh(1.0);
17254        let r = _mm_ucomigt_sh(a, b);
17255        assert_eq!(r, 1);
17256    }
17257
17258    #[simd_test(enable = "avx512fp16")]
17259    unsafe fn test_mm_ucomile_sh() {
17260        let a = _mm_set_sh(1.0);
17261        let b = _mm_set_sh(2.0);
17262        let r = _mm_ucomile_sh(a, b);
17263        assert_eq!(r, 1);
17264    }
17265
17266    #[simd_test(enable = "avx512fp16")]
17267    unsafe fn test_mm_ucomilt_sh() {
17268        let a = _mm_set_sh(1.0);
17269        let b = _mm_set_sh(2.0);
17270        let r = _mm_ucomilt_sh(a, b);
17271        assert_eq!(r, 1);
17272    }
17273
17274    #[simd_test(enable = "avx512fp16")]
17275    unsafe fn test_mm_ucomineq_sh() {
17276        let a = _mm_set_sh(1.0);
17277        let b = _mm_set_sh(2.0);
17278        let r = _mm_ucomineq_sh(a, b);
17279        assert_eq!(r, 1);
17280    }
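
    // The `_mm_comi*_sh` and `_mm_ucomi*_sh` tests above only exercise ordinary values; the
    // difference between the two families is their NaN behaviour: the comi forms perform a
    // signalling comparison (quiet NaN inputs raise the invalid exception), while the ucomi
    // forms are quiet and only signal on signalling NaNs. Both return 1 when the predicate
    // holds and 0 otherwise.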
17281
17282    #[simd_test(enable = "avx512fp16,avx512vl")]
17283    unsafe fn test_mm_load_ph() {
17284        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17285        let b = _mm_load_ph(addr_of!(a).cast());
17286        assert_eq_m128h(a, b);
17287    }
17288
17289    #[simd_test(enable = "avx512fp16,avx512vl")]
17290    unsafe fn test_mm256_load_ph() {
17291        let a = _mm256_set_ph(
17292            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17293        );
17294        let b = _mm256_load_ph(addr_of!(a).cast());
17295        assert_eq_m256h(a, b);
17296    }
17297
17298    #[simd_test(enable = "avx512fp16")]
17299    unsafe fn test_mm512_load_ph() {
17300        let a = _mm512_set_ph(
17301            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17302            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17303            31.0, 32.0,
17304        );
17305        let b = _mm512_load_ph(addr_of!(a).cast());
17306        assert_eq_m512h(a, b);
17307    }
17308
17309    #[simd_test(enable = "avx512fp16")]
17310    unsafe fn test_mm_load_sh() {
17311        let a = _mm_set_sh(1.0);
17312        let b = _mm_load_sh(addr_of!(a).cast());
17313        assert_eq_m128h(a, b);
17314    }
17315
17316    #[simd_test(enable = "avx512fp16")]
17317    unsafe fn test_mm_mask_load_sh() {
17318        let a = _mm_set_sh(1.0);
17319        let src = _mm_set_sh(2.);
17320        let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast());
17321        assert_eq_m128h(a, b);
17322        let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast());
17323        assert_eq_m128h(src, b);
17324    }
17325
17326    #[simd_test(enable = "avx512fp16")]
17327    unsafe fn test_mm_maskz_load_sh() {
17328        let a = _mm_set_sh(1.0);
17329        let b = _mm_maskz_load_sh(1, addr_of!(a).cast());
17330        assert_eq_m128h(a, b);
17331        let b = _mm_maskz_load_sh(0, addr_of!(a).cast());
17332        assert_eq_m128h(_mm_setzero_ph(), b);
17333    }
17334
17335    #[simd_test(enable = "avx512fp16,avx512vl")]
17336    unsafe fn test_mm_loadu_ph() {
17337        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
17338        let r = _mm_loadu_ph(array.as_ptr());
17339        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17340        assert_eq_m128h(r, e);
17341    }
17342
17343    #[simd_test(enable = "avx512fp16,avx512vl")]
17344    unsafe fn test_mm256_loadu_ph() {
17345        let array = [
17346            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17347        ];
17348        let r = _mm256_loadu_ph(array.as_ptr());
17349        let e = _mm256_setr_ph(
17350            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17351        );
17352        assert_eq_m256h(r, e);
17353    }
17354
17355    #[simd_test(enable = "avx512fp16")]
17356    unsafe fn test_mm512_loadu_ph() {
17357        let array = [
17358            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17359            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17360            31.0, 32.0,
17361        ];
17362        let r = _mm512_loadu_ph(array.as_ptr());
17363        let e = _mm512_setr_ph(
17364            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17365            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17366            31.0, 32.0,
17367        );
17368        assert_eq_m512h(r, e);
17369    }
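
    // `_mm_load_ph`/`_mm256_load_ph`/`_mm512_load_ph` (and the matching stores) require the
    // pointer to be aligned to the full vector width; the tests above satisfy this by reading
    // from the address of another vector of the same type, which is naturally aligned, while
    // the `loadu`/`storeu` forms accept arbitrarily aligned data such as the plain arrays
    // used here. A minimal sketch (hypothetical helper, not part of this file) of how a
    // caller could guarantee 16-byte alignment for an 8-element half-precision buffer:
    #[allow(dead_code)]
    #[repr(align(16))]
    struct AlignedHalves([f16; 8]);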
17370
17371    #[simd_test(enable = "avx512fp16")]
17372    unsafe fn test_mm_move_sh() {
17373        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17374        let b = _mm_set_sh(9.0);
17375        let r = _mm_move_sh(a, b);
17376        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
17377        assert_eq_m128h(r, e);
17378    }
17379
17380    #[simd_test(enable = "avx512fp16")]
17381    unsafe fn test_mm_mask_move_sh() {
17382        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17383        let b = _mm_set_sh(9.0);
17384        let src = _mm_set_sh(10.0);
17385        let r = _mm_mask_move_sh(src, 0, a, b);
17386        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
17387        assert_eq_m128h(r, e);
17388    }
17389
17390    #[simd_test(enable = "avx512fp16")]
17391    unsafe fn test_mm_maskz_move_sh() {
17392        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17393        let b = _mm_set_sh(9.0);
17394        let r = _mm_maskz_move_sh(0, a, b);
17395        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
17396        assert_eq_m128h(r, e);
17397    }
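
    // For `_mm_move_sh` and its masked forms the upper seven elements always come from `a`;
    // only the lowest element is selected: `b`'s low element when the mask bit is set, the
    // low element of `src` for the mask variant, and 0.0 for the maskz variant. That is why
    // the expected vectors above differ from `a` only in the last `_mm_set_ph` argument.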
17398
17399    #[simd_test(enable = "avx512fp16,avx512vl")]
17400    unsafe fn test_mm_store_ph() {
17401        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17402        let mut b = _mm_setzero_ph();
17403        _mm_store_ph(addr_of_mut!(b).cast(), a);
17404        assert_eq_m128h(a, b);
17405    }
17406
17407    #[simd_test(enable = "avx512fp16,avx512vl")]
17408    unsafe fn test_mm256_store_ph() {
17409        let a = _mm256_set_ph(
17410            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17411        );
17412        let mut b = _mm256_setzero_ph();
17413        _mm256_store_ph(addr_of_mut!(b).cast(), a);
17414        assert_eq_m256h(a, b);
17415    }
17416
17417    #[simd_test(enable = "avx512fp16")]
17418    unsafe fn test_mm512_store_ph() {
17419        let a = _mm512_set_ph(
17420            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17421            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17422            31.0, 32.0,
17423        );
17424        let mut b = _mm512_setzero_ph();
17425        _mm512_store_ph(addr_of_mut!(b).cast(), a);
17426        assert_eq_m512h(a, b);
17427    }
17428
17429    #[simd_test(enable = "avx512fp16")]
17430    unsafe fn test_mm_store_sh() {
17431        let a = _mm_set_sh(1.0);
17432        let mut b = _mm_setzero_ph();
17433        _mm_store_sh(addr_of_mut!(b).cast(), a);
17434        assert_eq_m128h(a, b);
17435    }
17436
17437    #[simd_test(enable = "avx512fp16")]
17438    unsafe fn test_mm_mask_store_sh() {
17439        let a = _mm_set_sh(1.0);
17440        let mut b = _mm_setzero_ph();
17441        _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
17442        assert_eq_m128h(_mm_setzero_ph(), b);
17443        _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
17444        assert_eq_m128h(a, b);
17445    }
17446
17447    #[simd_test(enable = "avx512fp16,avx512vl")]
17448    unsafe fn test_mm_storeu_ph() {
17449        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17450        let mut array = [0.0; 8];
17451        _mm_storeu_ph(array.as_mut_ptr(), a);
17452        assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr()));
17453    }
17454
17455    #[simd_test(enable = "avx512fp16,avx512vl")]
17456    unsafe fn test_mm256_storeu_ph() {
17457        let a = _mm256_set_ph(
17458            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17459        );
17460        let mut array = [0.0; 16];
17461        _mm256_storeu_ph(array.as_mut_ptr(), a);
17462        assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr()));
17463    }
17464
17465    #[simd_test(enable = "avx512fp16")]
17466    unsafe fn test_mm512_storeu_ph() {
17467        let a = _mm512_set_ph(
17468            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17469            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17470            31.0, 32.0,
17471        );
17472        let mut array = [0.0; 32];
17473        _mm512_storeu_ph(array.as_mut_ptr(), a);
17474        assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr()));
17475    }
17476
17477    #[simd_test(enable = "avx512fp16,avx512vl")]
17478    unsafe fn test_mm_add_ph() {
17479        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17480        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17481        let r = _mm_add_ph(a, b);
17482        let e = _mm_set1_ph(9.0);
17483        assert_eq_m128h(r, e);
17484    }
17485
17486    #[simd_test(enable = "avx512fp16,avx512vl")]
17487    unsafe fn test_mm_mask_add_ph() {
17488        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17489        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17490        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17491        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
17492        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
17493        assert_eq_m128h(r, e);
17494    }
17495
17496    #[simd_test(enable = "avx512fp16,avx512vl")]
17497    unsafe fn test_mm_maskz_add_ph() {
17498        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17499        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17500        let r = _mm_maskz_add_ph(0b01010101, a, b);
17501        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
17502        assert_eq_m128h(r, e);
17503    }
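
    // The masked arithmetic tests use the write mask 0b01010101: bit i controls element i,
    // so the even-indexed elements (e0, e2, e4, e6) receive the computed sum while the
    // remaining elements keep the corresponding `src` lane (`mask_` variants) or are zeroed
    // (`maskz_` variants). Because `_mm_set_ph` lists e7 first, the selected lanes show up
    // as every other value counting from the right in the expected vectors.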
17504
17505    #[simd_test(enable = "avx512fp16,avx512vl")]
17506    unsafe fn test_mm256_add_ph() {
17507        let a = _mm256_set_ph(
17508            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17509        );
17510        let b = _mm256_set_ph(
17511            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17512        );
17513        let r = _mm256_add_ph(a, b);
17514        let e = _mm256_set1_ph(17.0);
17515        assert_eq_m256h(r, e);
17516    }
17517
17518    #[simd_test(enable = "avx512fp16,avx512vl")]
17519    unsafe fn test_mm256_mask_add_ph() {
17520        let a = _mm256_set_ph(
17521            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17522        );
17523        let b = _mm256_set_ph(
17524            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17525        );
17526        let src = _mm256_set_ph(
17527            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17528        );
17529        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
17530        let e = _mm256_set_ph(
17531            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
17532        );
17533        assert_eq_m256h(r, e);
17534    }
17535
17536    #[simd_test(enable = "avx512fp16,avx512vl")]
17537    unsafe fn test_mm256_maskz_add_ph() {
17538        let a = _mm256_set_ph(
17539            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17540        );
17541        let b = _mm256_set_ph(
17542            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17543        );
17544        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
17545        let e = _mm256_set_ph(
17546            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
17547        );
17548        assert_eq_m256h(r, e);
17549    }
17550
17551    #[simd_test(enable = "avx512fp16")]
17552    unsafe fn test_mm512_add_ph() {
17553        let a = _mm512_set_ph(
17554            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17555            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17556            31.0, 32.0,
17557        );
17558        let b = _mm512_set_ph(
17559            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17560            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17561            3.0, 2.0, 1.0,
17562        );
17563        let r = _mm512_add_ph(a, b);
17564        let e = _mm512_set1_ph(33.0);
17565        assert_eq_m512h(r, e);
17566    }
17567
17568    #[simd_test(enable = "avx512fp16")]
17569    unsafe fn test_mm512_mask_add_ph() {
17570        let a = _mm512_set_ph(
17571            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17572            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17573            31.0, 32.0,
17574        );
17575        let b = _mm512_set_ph(
17576            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17577            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17578            3.0, 2.0, 1.0,
17579        );
17580        let src = _mm512_set_ph(
17581            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17582            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17583        );
17584        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
17585        let e = _mm512_set_ph(
17586            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17587            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17588        );
17589        assert_eq_m512h(r, e);
17590    }
17591
17592    #[simd_test(enable = "avx512fp16")]
17593    unsafe fn test_mm512_maskz_add_ph() {
17594        let a = _mm512_set_ph(
17595            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17596            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17597            31.0, 32.0,
17598        );
17599        let b = _mm512_set_ph(
17600            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17601            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17602            3.0, 2.0, 1.0,
17603        );
17604        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
17605        let e = _mm512_set_ph(
17606            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17607            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17608        );
17609        assert_eq_m512h(r, e);
17610    }
17611
17612    #[simd_test(enable = "avx512fp16")]
17613    unsafe fn test_mm512_add_round_ph() {
17614        let a = _mm512_set_ph(
17615            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17616            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17617            31.0, 32.0,
17618        );
17619        let b = _mm512_set_ph(
17620            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17621            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17622            3.0, 2.0, 1.0,
17623        );
17624        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17625        let e = _mm512_set1_ph(33.0);
17626        assert_eq_m512h(r, e);
17627    }
17628
17629    #[simd_test(enable = "avx512fp16")]
17630    unsafe fn test_mm512_mask_add_round_ph() {
17631        let a = _mm512_set_ph(
17632            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17633            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17634            31.0, 32.0,
17635        );
17636        let b = _mm512_set_ph(
17637            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17638            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17639            3.0, 2.0, 1.0,
17640        );
17641        let src = _mm512_set_ph(
17642            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17643            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17644        );
17645        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17646            src,
17647            0b01010101010101010101010101010101,
17648            a,
17649            b,
17650        );
17651        let e = _mm512_set_ph(
17652            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17653            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17654        );
17655        assert_eq_m512h(r, e);
17656    }
17657
17658    #[simd_test(enable = "avx512fp16")]
17659    unsafe fn test_mm512_maskz_add_round_ph() {
17660        let a = _mm512_set_ph(
17661            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17662            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17663            31.0, 32.0,
17664        );
17665        let b = _mm512_set_ph(
17666            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17667            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17668            3.0, 2.0, 1.0,
17669        );
17670        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17671            0b01010101010101010101010101010101,
17672            a,
17673            b,
17674        );
17675        let e = _mm512_set_ph(
17676            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17677            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17678        );
17679        assert_eq_m512h(r, e);
17680    }
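
    // The `_round` variants take the rounding control as a const generic, so it must be a
    // compile-time constant. `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` selects
    // round-to-nearest-even with exception suppression (SAE); with exactly representable
    // inputs like the ones above the result matches the non-`_round` intrinsics bit for bit.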
17681
17682    #[simd_test(enable = "avx512fp16")]
17683    unsafe fn test_mm_add_round_sh() {
17684        let a = _mm_set_sh(1.0);
17685        let b = _mm_set_sh(2.0);
17686        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17687        let e = _mm_set_sh(3.0);
17688        assert_eq_m128h(r, e);
17689    }
17690
17691    #[simd_test(enable = "avx512fp16")]
17692    unsafe fn test_mm_mask_add_round_sh() {
17693        let a = _mm_set_sh(1.0);
17694        let b = _mm_set_sh(2.0);
17695        let src = _mm_set_sh(4.0);
17696        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17697            src, 0, a, b,
17698        );
17699        let e = _mm_set_sh(4.0);
17700        assert_eq_m128h(r, e);
17701        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17702            src, 1, a, b,
17703        );
17704        let e = _mm_set_sh(3.0);
17705        assert_eq_m128h(r, e);
17706    }
17707
17708    #[simd_test(enable = "avx512fp16")]
17709    unsafe fn test_mm_maskz_add_round_sh() {
17710        let a = _mm_set_sh(1.0);
17711        let b = _mm_set_sh(2.0);
17712        let r =
17713            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17714        let e = _mm_set_sh(0.0);
17715        assert_eq_m128h(r, e);
17716        let r =
17717            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
17718        let e = _mm_set_sh(3.0);
17719        assert_eq_m128h(r, e);
17720    }
17721
17722    #[simd_test(enable = "avx512fp16")]
17723    unsafe fn test_mm_add_sh() {
17724        let a = _mm_set_sh(1.0);
17725        let b = _mm_set_sh(2.0);
17726        let r = _mm_add_sh(a, b);
17727        let e = _mm_set_sh(3.0);
17728        assert_eq_m128h(r, e);
17729    }
17730
17731    #[simd_test(enable = "avx512fp16")]
17732    unsafe fn test_mm_mask_add_sh() {
17733        let a = _mm_set_sh(1.0);
17734        let b = _mm_set_sh(2.0);
17735        let src = _mm_set_sh(4.0);
17736        let r = _mm_mask_add_sh(src, 0, a, b);
17737        let e = _mm_set_sh(4.0);
17738        assert_eq_m128h(r, e);
17739        let r = _mm_mask_add_sh(src, 1, a, b);
17740        let e = _mm_set_sh(3.0);
17741        assert_eq_m128h(r, e);
17742    }
17743
17744    #[simd_test(enable = "avx512fp16")]
17745    unsafe fn test_mm_maskz_add_sh() {
17746        let a = _mm_set_sh(1.0);
17747        let b = _mm_set_sh(2.0);
17748        let r = _mm_maskz_add_sh(0, a, b);
17749        let e = _mm_set_sh(0.0);
17750        assert_eq_m128h(r, e);
17751        let r = _mm_maskz_add_sh(1, a, b);
17752        let e = _mm_set_sh(3.0);
17753        assert_eq_m128h(r, e);
17754    }
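
    // The scalar `_sh` forms only operate on the lowest element; the upper seven elements of
    // the result are copied from `a` regardless of the mask. The tests above use `_mm_set_sh`
    // inputs whose upper elements are already zero, so the expected values reduce to a single
    // scalar: `src` (4.0) when the mask bit is clear, 0.0 for the maskz form, and the sum 3.0
    // when the mask bit is set.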
17755
17756    #[simd_test(enable = "avx512fp16,avx512vl")]
17757    unsafe fn test_mm_sub_ph() {
17758        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17759        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17760        let r = _mm_sub_ph(a, b);
17761        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
17762        assert_eq_m128h(r, e);
17763    }
17764
17765    #[simd_test(enable = "avx512fp16,avx512vl")]
17766    unsafe fn test_mm_mask_sub_ph() {
17767        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17768        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17769        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17770        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
17771        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
17772        assert_eq_m128h(r, e);
17773    }
17774
17775    #[simd_test(enable = "avx512fp16,avx512vl")]
17776    unsafe fn test_mm_maskz_sub_ph() {
17777        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17778        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17779        let r = _mm_maskz_sub_ph(0b01010101, a, b);
17780        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
17781        assert_eq_m128h(r, e);
17782    }
17783
17784    #[simd_test(enable = "avx512fp16,avx512vl")]
17785    unsafe fn test_mm256_sub_ph() {
17786        let a = _mm256_set_ph(
17787            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17788        );
17789        let b = _mm256_set_ph(
17790            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17791        );
17792        let r = _mm256_sub_ph(a, b);
17793        let e = _mm256_set_ph(
17794            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
17795            15.0,
17796        );
17797        assert_eq_m256h(r, e);
17798    }
17799
17800    #[simd_test(enable = "avx512fp16,avx512vl")]
17801    unsafe fn test_mm256_mask_sub_ph() {
17802        let a = _mm256_set_ph(
17803            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17804        );
17805        let b = _mm256_set_ph(
17806            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17807        );
17808        let src = _mm256_set_ph(
17809            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17810        );
17811        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
17812        let e = _mm256_set_ph(
17813            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
17814        );
17815        assert_eq_m256h(r, e);
17816    }
17817
17818    #[simd_test(enable = "avx512fp16,avx512vl")]
17819    unsafe fn test_mm256_maskz_sub_ph() {
17820        let a = _mm256_set_ph(
17821            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17822        );
17823        let b = _mm256_set_ph(
17824            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17825        );
17826        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
17827        let e = _mm256_set_ph(
17828            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
17829        );
17830        assert_eq_m256h(r, e);
17831    }
17832
17833    #[simd_test(enable = "avx512fp16")]
17834    unsafe fn test_mm512_sub_ph() {
17835        let a = _mm512_set_ph(
17836            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17837            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17838            31.0, 32.0,
17839        );
17840        let b = _mm512_set_ph(
17841            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17842            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17843            3.0, 2.0, 1.0,
17844        );
17845        let r = _mm512_sub_ph(a, b);
17846        let e = _mm512_set_ph(
17847            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17848            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17849            23.0, 25.0, 27.0, 29.0, 31.0,
17850        );
17851        assert_eq_m512h(r, e);
17852    }
17853
17854    #[simd_test(enable = "avx512fp16")]
17855    unsafe fn test_mm512_mask_sub_ph() {
17856        let a = _mm512_set_ph(
17857            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17858            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17859            31.0, 32.0,
17860        );
17861        let b = _mm512_set_ph(
17862            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17863            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17864            3.0, 2.0, 1.0,
17865        );
17866        let src = _mm512_set_ph(
17867            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17868            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17869        );
17870        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
17871        let e = _mm512_set_ph(
17872            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17873            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17874        );
17875        assert_eq_m512h(r, e);
17876    }
17877
17878    #[simd_test(enable = "avx512fp16")]
17879    unsafe fn test_mm512_maskz_sub_ph() {
17880        let a = _mm512_set_ph(
17881            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17882            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17883            31.0, 32.0,
17884        );
17885        let b = _mm512_set_ph(
17886            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17887            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17888            3.0, 2.0, 1.0,
17889        );
17890        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
17891        let e = _mm512_set_ph(
17892            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17893            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17894        );
17895        assert_eq_m512h(r, e);
17896    }
17897
17898    #[simd_test(enable = "avx512fp16")]
17899    unsafe fn test_mm512_sub_round_ph() {
17900        let a = _mm512_set_ph(
17901            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17902            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17903            31.0, 32.0,
17904        );
17905        let b = _mm512_set_ph(
17906            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17907            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17908            3.0, 2.0, 1.0,
17909        );
17910        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17911        let e = _mm512_set_ph(
17912            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17913            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17914            23.0, 25.0, 27.0, 29.0, 31.0,
17915        );
17916        assert_eq_m512h(r, e);
17917    }
17918
17919    #[simd_test(enable = "avx512fp16")]
17920    unsafe fn test_mm512_mask_sub_round_ph() {
17921        let a = _mm512_set_ph(
17922            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17923            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17924            31.0, 32.0,
17925        );
17926        let b = _mm512_set_ph(
17927            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17928            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17929            3.0, 2.0, 1.0,
17930        );
17931        let src = _mm512_set_ph(
17932            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17933            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17934        );
17935        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17936            src,
17937            0b01010101010101010101010101010101,
17938            a,
17939            b,
17940        );
17941        let e = _mm512_set_ph(
17942            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17943            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17944        );
17945        assert_eq_m512h(r, e);
17946    }
17947
17948    #[simd_test(enable = "avx512fp16")]
17949    unsafe fn test_mm512_maskz_sub_round_ph() {
17950        let a = _mm512_set_ph(
17951            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17952            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17953            31.0, 32.0,
17954        );
17955        let b = _mm512_set_ph(
17956            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17957            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17958            3.0, 2.0, 1.0,
17959        );
17960        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17961            0b01010101010101010101010101010101,
17962            a,
17963            b,
17964        );
17965        let e = _mm512_set_ph(
17966            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17967            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17968        );
17969        assert_eq_m512h(r, e);
17970    }
17971
17972    #[simd_test(enable = "avx512fp16")]
17973    unsafe fn test_mm_sub_round_sh() {
17974        let a = _mm_set_sh(1.0);
17975        let b = _mm_set_sh(2.0);
17976        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17977        let e = _mm_set_sh(-1.0);
17978        assert_eq_m128h(r, e);
17979    }
17980
17981    #[simd_test(enable = "avx512fp16")]
17982    unsafe fn test_mm_mask_sub_round_sh() {
17983        let a = _mm_set_sh(1.0);
17984        let b = _mm_set_sh(2.0);
17985        let src = _mm_set_sh(4.0);
17986        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17987            src, 0, a, b,
17988        );
17989        let e = _mm_set_sh(4.0);
17990        assert_eq_m128h(r, e);
17991        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17992            src, 1, a, b,
17993        );
17994        let e = _mm_set_sh(-1.0);
17995        assert_eq_m128h(r, e);
17996    }
17997
17998    #[simd_test(enable = "avx512fp16")]
17999    unsafe fn test_mm_maskz_sub_round_sh() {
18000        let a = _mm_set_sh(1.0);
18001        let b = _mm_set_sh(2.0);
18002        let r =
18003            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18004        let e = _mm_set_sh(0.0);
18005        assert_eq_m128h(r, e);
18006        let r =
18007            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18008        let e = _mm_set_sh(-1.0);
18009        assert_eq_m128h(r, e);
18010    }
18011
18012    #[simd_test(enable = "avx512fp16")]
18013    unsafe fn test_mm_sub_sh() {
18014        let a = _mm_set_sh(1.0);
18015        let b = _mm_set_sh(2.0);
18016        let r = _mm_sub_sh(a, b);
18017        let e = _mm_set_sh(-1.0);
18018        assert_eq_m128h(r, e);
18019    }
18020
18021    #[simd_test(enable = "avx512fp16")]
18022    unsafe fn test_mm_mask_sub_sh() {
18023        let a = _mm_set_sh(1.0);
18024        let b = _mm_set_sh(2.0);
18025        let src = _mm_set_sh(4.0);
18026        let r = _mm_mask_sub_sh(src, 0, a, b);
18027        let e = _mm_set_sh(4.0);
18028        assert_eq_m128h(r, e);
18029        let r = _mm_mask_sub_sh(src, 1, a, b);
18030        let e = _mm_set_sh(-1.0);
18031        assert_eq_m128h(r, e);
18032    }
18033
18034    #[simd_test(enable = "avx512fp16")]
18035    unsafe fn test_mm_maskz_sub_sh() {
18036        let a = _mm_set_sh(1.0);
18037        let b = _mm_set_sh(2.0);
18038        let r = _mm_maskz_sub_sh(0, a, b);
18039        let e = _mm_set_sh(0.0);
18040        assert_eq_m128h(r, e);
18041        let r = _mm_maskz_sub_sh(1, a, b);
18042        let e = _mm_set_sh(-1.0);
18043        assert_eq_m128h(r, e);
18044    }
18045
18046    #[simd_test(enable = "avx512fp16,avx512vl")]
18047    unsafe fn test_mm_mul_ph() {
18048        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18049        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18050        let r = _mm_mul_ph(a, b);
18051        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
18052        assert_eq_m128h(r, e);
18053    }
18054
18055    #[simd_test(enable = "avx512fp16,avx512vl")]
18056    unsafe fn test_mm_mask_mul_ph() {
18057        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18058        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18059        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18060        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
18061        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
18062        assert_eq_m128h(r, e);
18063    }
18064
18065    #[simd_test(enable = "avx512fp16,avx512vl")]
18066    unsafe fn test_mm_maskz_mul_ph() {
18067        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18068        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18069        let r = _mm_maskz_mul_ph(0b01010101, a, b);
18070        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
18071        assert_eq_m128h(r, e);
18072    }
18073
18074    #[simd_test(enable = "avx512fp16,avx512vl")]
18075    unsafe fn test_mm256_mul_ph() {
18076        let a = _mm256_set_ph(
18077            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18078        );
18079        let b = _mm256_set_ph(
18080            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18081        );
18082        let r = _mm256_mul_ph(a, b);
18083        let e = _mm256_set_ph(
18084            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
18085            30.0, 16.0,
18086        );
18087        assert_eq_m256h(r, e);
18088    }
18089
18090    #[simd_test(enable = "avx512fp16,avx512vl")]
18091    unsafe fn test_mm256_mask_mul_ph() {
18092        let a = _mm256_set_ph(
18093            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18094        );
18095        let b = _mm256_set_ph(
18096            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18097        );
18098        let src = _mm256_set_ph(
18099            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
18100        );
18101        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
18102        let e = _mm256_set_ph(
18103            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
18104        );
18105        assert_eq_m256h(r, e);
18106    }
18107
18108    #[simd_test(enable = "avx512fp16,avx512vl")]
18109    unsafe fn test_mm256_maskz_mul_ph() {
18110        let a = _mm256_set_ph(
18111            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18112        );
18113        let b = _mm256_set_ph(
18114            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18115        );
18116        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
18117        let e = _mm256_set_ph(
18118            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
18119        );
18120        assert_eq_m256h(r, e);
18121    }
18122
18123    #[simd_test(enable = "avx512fp16")]
18124    unsafe fn test_mm512_mul_ph() {
18125        let a = _mm512_set_ph(
18126            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18127            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18128            31.0, 32.0,
18129        );
18130        let b = _mm512_set_ph(
18131            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18132            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18133            3.0, 2.0, 1.0,
18134        );
18135        let r = _mm512_mul_ph(a, b);
18136        let e = _mm512_set_ph(
18137            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18138            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18139            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18140        );
18141        assert_eq_m512h(r, e);
18142    }
18143
18144    #[simd_test(enable = "avx512fp16")]
18145    unsafe fn test_mm512_mask_mul_ph() {
18146        let a = _mm512_set_ph(
18147            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18148            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18149            31.0, 32.0,
18150        );
18151        let b = _mm512_set_ph(
18152            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18153            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18154            3.0, 2.0, 1.0,
18155        );
18156        let src = _mm512_set_ph(
18157            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18158            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18159        );
18160        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
18161        let e = _mm512_set_ph(
18162            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18163            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18164        );
18165        assert_eq_m512h(r, e);
18166    }
18167
18168    #[simd_test(enable = "avx512fp16")]
18169    unsafe fn test_mm512_maskz_mul_ph() {
18170        let a = _mm512_set_ph(
18171            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18172            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18173            31.0, 32.0,
18174        );
18175        let b = _mm512_set_ph(
18176            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18177            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18178            3.0, 2.0, 1.0,
18179        );
18180        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
18181        let e = _mm512_set_ph(
18182            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18183            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18184        );
18185        assert_eq_m512h(r, e);
18186    }
18187
18188    #[simd_test(enable = "avx512fp16")]
18189    unsafe fn test_mm512_mul_round_ph() {
18190        let a = _mm512_set_ph(
18191            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18192            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18193            31.0, 32.0,
18194        );
18195        let b = _mm512_set_ph(
18196            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18197            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18198            3.0, 2.0, 1.0,
18199        );
18200        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18201        let e = _mm512_set_ph(
18202            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18203            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18204            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18205        );
18206        assert_eq_m512h(r, e);
18207    }
18208
18209    #[simd_test(enable = "avx512fp16")]
18210    unsafe fn test_mm512_mask_mul_round_ph() {
18211        let a = _mm512_set_ph(
18212            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18213            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18214            31.0, 32.0,
18215        );
18216        let b = _mm512_set_ph(
18217            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18218            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18219            3.0, 2.0, 1.0,
18220        );
18221        let src = _mm512_set_ph(
18222            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18223            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18224        );
18225        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18226            src,
18227            0b01010101010101010101010101010101,
18228            a,
18229            b,
18230        );
18231        let e = _mm512_set_ph(
18232            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18233            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18234        );
18235        assert_eq_m512h(r, e);
18236    }
18237
18238    #[simd_test(enable = "avx512fp16")]
18239    unsafe fn test_mm512_maskz_mul_round_ph() {
18240        let a = _mm512_set_ph(
18241            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18242            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18243            31.0, 32.0,
18244        );
18245        let b = _mm512_set_ph(
18246            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18247            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18248            3.0, 2.0, 1.0,
18249        );
18250        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18251            0b01010101010101010101010101010101,
18252            a,
18253            b,
18254        );
18255        let e = _mm512_set_ph(
18256            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18257            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18258        );
18259        assert_eq_m512h(r, e);
18260    }
18261
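    // The scalar _sh intrinsics compute only element 0; the remaining elements of the result
    // are copied from a. With a zero mask, element 0 comes from src (mask variants) or is
    // zeroed (maskz variants).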
18262    #[simd_test(enable = "avx512fp16")]
18263    unsafe fn test_mm_mul_round_sh() {
18264        let a = _mm_set_sh(1.0);
18265        let b = _mm_set_sh(2.0);
18266        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18267        let e = _mm_set_sh(2.0);
18268        assert_eq_m128h(r, e);
18269    }
18270
18271    #[simd_test(enable = "avx512fp16")]
18272    unsafe fn test_mm_mask_mul_round_sh() {
18273        let a = _mm_set_sh(1.0);
18274        let b = _mm_set_sh(2.0);
18275        let src = _mm_set_sh(4.0);
18276        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18277            src, 0, a, b,
18278        );
18279        let e = _mm_set_sh(4.0);
18280        assert_eq_m128h(r, e);
18281        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18282            src, 1, a, b,
18283        );
18284        let e = _mm_set_sh(2.0);
18285        assert_eq_m128h(r, e);
18286    }
18287
18288    #[simd_test(enable = "avx512fp16")]
18289    unsafe fn test_mm_maskz_mul_round_sh() {
18290        let a = _mm_set_sh(1.0);
18291        let b = _mm_set_sh(2.0);
18292        let r =
18293            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18294        let e = _mm_set_sh(0.0);
18295        assert_eq_m128h(r, e);
18296        let r =
18297            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18298        let e = _mm_set_sh(2.0);
18299        assert_eq_m128h(r, e);
18300    }
18301
18302    #[simd_test(enable = "avx512fp16")]
18303    unsafe fn test_mm_mul_sh() {
18304        let a = _mm_set_sh(1.0);
18305        let b = _mm_set_sh(2.0);
18306        let r = _mm_mul_sh(a, b);
18307        let e = _mm_set_sh(2.0);
18308        assert_eq_m128h(r, e);
18309    }
18310
18311    #[simd_test(enable = "avx512fp16")]
18312    unsafe fn test_mm_mask_mul_sh() {
18313        let a = _mm_set_sh(1.0);
18314        let b = _mm_set_sh(2.0);
18315        let src = _mm_set_sh(4.0);
18316        let r = _mm_mask_mul_sh(src, 0, a, b);
18317        let e = _mm_set_sh(4.0);
18318        assert_eq_m128h(r, e);
18319        let r = _mm_mask_mul_sh(src, 1, a, b);
18320        let e = _mm_set_sh(2.0);
18321        assert_eq_m128h(r, e);
18322    }
18323
18324    #[simd_test(enable = "avx512fp16")]
18325    unsafe fn test_mm_maskz_mul_sh() {
18326        let a = _mm_set_sh(1.0);
18327        let b = _mm_set_sh(2.0);
18328        let r = _mm_maskz_mul_sh(0, a, b);
18329        let e = _mm_set_sh(0.0);
18330        assert_eq_m128h(r, e);
18331        let r = _mm_maskz_mul_sh(1, a, b);
18332        let e = _mm_set_sh(2.0);
18333        assert_eq_m128h(r, e);
18334    }
18335
18336    #[simd_test(enable = "avx512fp16,avx512vl")]
18337    unsafe fn test_mm_div_ph() {
18338        let a = _mm_set1_ph(1.0);
18339        let b = _mm_set1_ph(2.0);
18340        let r = _mm_div_ph(a, b);
18341        let e = _mm_set1_ph(0.5);
18342        assert_eq_m128h(r, e);
18343    }
18344
18345    #[simd_test(enable = "avx512fp16,avx512vl")]
18346    unsafe fn test_mm_mask_div_ph() {
18347        let a = _mm_set1_ph(1.0);
18348        let b = _mm_set1_ph(2.0);
18349        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
18350        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
18351        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
18352        assert_eq_m128h(r, e);
18353    }
18354
18355    #[simd_test(enable = "avx512fp16,avx512vl")]
18356    unsafe fn test_mm_maskz_div_ph() {
18357        let a = _mm_set1_ph(1.0);
18358        let b = _mm_set1_ph(2.0);
18359        let r = _mm_maskz_div_ph(0b01010101, a, b);
18360        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
18361        assert_eq_m128h(r, e);
18362    }
18363
18364    #[simd_test(enable = "avx512fp16,avx512vl")]
18365    unsafe fn test_mm256_div_ph() {
18366        let a = _mm256_set1_ph(1.0);
18367        let b = _mm256_set1_ph(2.0);
18368        let r = _mm256_div_ph(a, b);
18369        let e = _mm256_set1_ph(0.5);
18370        assert_eq_m256h(r, e);
18371    }
18372
18373    #[simd_test(enable = "avx512fp16,avx512vl")]
18374    unsafe fn test_mm256_mask_div_ph() {
18375        let a = _mm256_set1_ph(1.0);
18376        let b = _mm256_set1_ph(2.0);
18377        let src = _mm256_set_ph(
18378            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18379            19.0,
18380        );
18381        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
18382        let e = _mm256_set_ph(
18383            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18384        );
18385        assert_eq_m256h(r, e);
18386    }
18387
18388    #[simd_test(enable = "avx512fp16,avx512vl")]
18389    unsafe fn test_mm256_maskz_div_ph() {
18390        let a = _mm256_set1_ph(1.0);
18391        let b = _mm256_set1_ph(2.0);
18392        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
18393        let e = _mm256_set_ph(
18394            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18395        );
18396        assert_eq_m256h(r, e);
18397    }
18398
18399    #[simd_test(enable = "avx512fp16")]
18400    unsafe fn test_mm512_div_ph() {
18401        let a = _mm512_set1_ph(1.0);
18402        let b = _mm512_set1_ph(2.0);
18403        let r = _mm512_div_ph(a, b);
18404        let e = _mm512_set1_ph(0.5);
18405        assert_eq_m512h(r, e);
18406    }
18407
18408    #[simd_test(enable = "avx512fp16")]
18409    unsafe fn test_mm512_mask_div_ph() {
18410        let a = _mm512_set1_ph(1.0);
18411        let b = _mm512_set1_ph(2.0);
18412        let src = _mm512_set_ph(
18413            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18414            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18415            33.0, 34.0, 35.0,
18416        );
18417        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
18418        let e = _mm512_set_ph(
18419            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18420            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18421        );
18422        assert_eq_m512h(r, e);
18423    }
18424
18425    #[simd_test(enable = "avx512fp16")]
18426    unsafe fn test_mm512_maskz_div_ph() {
18427        let a = _mm512_set1_ph(1.0);
18428        let b = _mm512_set1_ph(2.0);
18429        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
18430        let e = _mm512_set_ph(
18431            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18432            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18433        );
18434        assert_eq_m512h(r, e);
18435    }
18436
18437    #[simd_test(enable = "avx512fp16")]
18438    unsafe fn test_mm512_div_round_ph() {
18439        let a = _mm512_set1_ph(1.0);
18440        let b = _mm512_set1_ph(2.0);
18441        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18442        let e = _mm512_set1_ph(0.5);
18443        assert_eq_m512h(r, e);
18444    }
18445
18446    #[simd_test(enable = "avx512fp16")]
18447    unsafe fn test_mm512_mask_div_round_ph() {
18448        let a = _mm512_set1_ph(1.0);
18449        let b = _mm512_set1_ph(2.0);
18450        let src = _mm512_set_ph(
18451            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18452            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18453            33.0, 34.0, 35.0,
18454        );
18455        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18456            src,
18457            0b01010101010101010101010101010101,
18458            a,
18459            b,
18460        );
18461        let e = _mm512_set_ph(
18462            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18463            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18464        );
18465        assert_eq_m512h(r, e);
18466    }
18467
18468    #[simd_test(enable = "avx512fp16")]
18469    unsafe fn test_mm512_maskz_div_round_ph() {
18470        let a = _mm512_set1_ph(1.0);
18471        let b = _mm512_set1_ph(2.0);
18472        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18473            0b01010101010101010101010101010101,
18474            a,
18475            b,
18476        );
18477        let e = _mm512_set_ph(
18478            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18479            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18480        );
18481        assert_eq_m512h(r, e);
18482    }
18483
18484    #[simd_test(enable = "avx512fp16")]
18485    unsafe fn test_mm_div_round_sh() {
18486        let a = _mm_set_sh(1.0);
18487        let b = _mm_set_sh(2.0);
18488        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18489        let e = _mm_set_sh(0.5);
18490        assert_eq_m128h(r, e);
18491    }
18492
18493    #[simd_test(enable = "avx512fp16")]
18494    unsafe fn test_mm_mask_div_round_sh() {
18495        let a = _mm_set_sh(1.0);
18496        let b = _mm_set_sh(2.0);
18497        let src = _mm_set_sh(4.0);
18498        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18499            src, 0, a, b,
18500        );
18501        let e = _mm_set_sh(4.0);
18502        assert_eq_m128h(r, e);
18503        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18504            src, 1, a, b,
18505        );
18506        let e = _mm_set_sh(0.5);
18507        assert_eq_m128h(r, e);
18508    }
18509
18510    #[simd_test(enable = "avx512fp16")]
18511    unsafe fn test_mm_maskz_div_round_sh() {
18512        let a = _mm_set_sh(1.0);
18513        let b = _mm_set_sh(2.0);
18514        let r =
18515            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18516        let e = _mm_set_sh(0.0);
18517        assert_eq_m128h(r, e);
18518        let r =
18519            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18520        let e = _mm_set_sh(0.5);
18521        assert_eq_m128h(r, e);
18522    }
18523
18524    #[simd_test(enable = "avx512fp16")]
18525    unsafe fn test_mm_div_sh() {
18526        let a = _mm_set_sh(1.0);
18527        let b = _mm_set_sh(2.0);
18528        let r = _mm_div_sh(a, b);
18529        let e = _mm_set_sh(0.5);
18530        assert_eq_m128h(r, e);
18531    }
18532
18533    #[simd_test(enable = "avx512fp16")]
18534    unsafe fn test_mm_mask_div_sh() {
18535        let a = _mm_set_sh(1.0);
18536        let b = _mm_set_sh(2.0);
18537        let src = _mm_set_sh(4.0);
18538        let r = _mm_mask_div_sh(src, 0, a, b);
18539        let e = _mm_set_sh(4.0);
18540        assert_eq_m128h(r, e);
18541        let r = _mm_mask_div_sh(src, 1, a, b);
18542        let e = _mm_set_sh(0.5);
18543        assert_eq_m128h(r, e);
18544    }
18545
18546    #[simd_test(enable = "avx512fp16")]
18547    unsafe fn test_mm_maskz_div_sh() {
18548        let a = _mm_set_sh(1.0);
18549        let b = _mm_set_sh(2.0);
18550        let r = _mm_maskz_div_sh(0, a, b);
18551        let e = _mm_set_sh(0.0);
18552        assert_eq_m128h(r, e);
18553        let r = _mm_maskz_div_sh(1, a, b);
18554        let e = _mm_set_sh(0.5);
18555        assert_eq_m128h(r, e);
18556    }
18557
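    // The *_pch intrinsics treat each consecutive (real, imaginary) pair of f16 lanes as one
    // complex number, and each mask bit covers one such pair. _mm_set1_pch(0.0, 1.0)
    // broadcasts i, and i * i = -1 + 0i, so every selected pair becomes (-1.0, 0.0).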
18558    #[simd_test(enable = "avx512fp16,avx512vl")]
18559    unsafe fn test_mm_mul_pch() {
18560        let a = _mm_set1_pch(0.0, 1.0);
18561        let b = _mm_set1_pch(0.0, 1.0);
18562        let r = _mm_mul_pch(a, b);
18563        let e = _mm_set1_pch(-1.0, 0.0);
18564        assert_eq_m128h(r, e);
18565    }
18566
18567    #[simd_test(enable = "avx512fp16,avx512vl")]
18568    unsafe fn test_mm_mask_mul_pch() {
18569        let a = _mm_set1_pch(0.0, 1.0);
18570        let b = _mm_set1_pch(0.0, 1.0);
18571        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18572        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
18573        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18574        assert_eq_m128h(r, e);
18575    }
18576
18577    #[simd_test(enable = "avx512fp16,avx512vl")]
18578    unsafe fn test_mm_maskz_mul_pch() {
18579        let a = _mm_set1_pch(0.0, 1.0);
18580        let b = _mm_set1_pch(0.0, 1.0);
18581        let r = _mm_maskz_mul_pch(0b0101, a, b);
18582        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18583        assert_eq_m128h(r, e);
18584    }
18585
18586    #[simd_test(enable = "avx512fp16,avx512vl")]
18587    unsafe fn test_mm256_mul_pch() {
18588        let a = _mm256_set1_pch(0.0, 1.0);
18589        let b = _mm256_set1_pch(0.0, 1.0);
18590        let r = _mm256_mul_pch(a, b);
18591        let e = _mm256_set1_pch(-1.0, 0.0);
18592        assert_eq_m256h(r, e);
18593    }
18594
18595    #[simd_test(enable = "avx512fp16,avx512vl")]
18596    unsafe fn test_mm256_mask_mul_pch() {
18597        let a = _mm256_set1_pch(0.0, 1.0);
18598        let b = _mm256_set1_pch(0.0, 1.0);
18599        let src = _mm256_setr_ph(
18600            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18601        );
18602        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
18603        let e = _mm256_setr_ph(
18604            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18605        );
18606        assert_eq_m256h(r, e);
18607    }
18608
18609    #[simd_test(enable = "avx512fp16,avx512vl")]
18610    unsafe fn test_mm256_maskz_mul_pch() {
18611        let a = _mm256_set1_pch(0.0, 1.0);
18612        let b = _mm256_set1_pch(0.0, 1.0);
18613        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
18614        let e = _mm256_setr_ph(
18615            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18616        );
18617        assert_eq_m256h(r, e);
18618    }
18619
18620    #[simd_test(enable = "avx512fp16")]
18621    unsafe fn test_mm512_mul_pch() {
18622        let a = _mm512_set1_pch(0.0, 1.0);
18623        let b = _mm512_set1_pch(0.0, 1.0);
18624        let r = _mm512_mul_pch(a, b);
18625        let e = _mm512_set1_pch(-1.0, 0.0);
18626        assert_eq_m512h(r, e);
18627    }
18628
18629    #[simd_test(enable = "avx512fp16")]
18630    unsafe fn test_mm512_mask_mul_pch() {
18631        let a = _mm512_set1_pch(0.0, 1.0);
18632        let b = _mm512_set1_pch(0.0, 1.0);
18633        let src = _mm512_setr_ph(
18634            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18635            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18636            32.0, 33.0,
18637        );
18638        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
18639        let e = _mm512_setr_ph(
18640            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18641            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18642            33.0,
18643        );
18644        assert_eq_m512h(r, e);
18645    }
18646
18647    #[simd_test(enable = "avx512fp16")]
18648    unsafe fn test_mm512_maskz_mul_pch() {
18649        let a = _mm512_set1_pch(0.0, 1.0);
18650        let b = _mm512_set1_pch(0.0, 1.0);
18651        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
18652        let e = _mm512_setr_ph(
18653            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18654            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18655        );
18656        assert_eq_m512h(r, e);
18657    }
18658
18659    #[simd_test(enable = "avx512fp16")]
18660    unsafe fn test_mm512_mul_round_pch() {
18661        let a = _mm512_set1_pch(0.0, 1.0);
18662        let b = _mm512_set1_pch(0.0, 1.0);
18663        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18664        let e = _mm512_set1_pch(-1.0, 0.0);
18665        assert_eq_m512h(r, e);
18666    }
18667
18668    #[simd_test(enable = "avx512fp16")]
18669    unsafe fn test_mm512_mask_mul_round_pch() {
18670        let a = _mm512_set1_pch(0.0, 1.0);
18671        let b = _mm512_set1_pch(0.0, 1.0);
18672        let src = _mm512_setr_ph(
18673            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18674            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18675            32.0, 33.0,
18676        );
18677        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18678            src,
18679            0b0101010101010101,
18680            a,
18681            b,
18682        );
18683        let e = _mm512_setr_ph(
18684            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18685            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18686            33.0,
18687        );
18688        assert_eq_m512h(r, e);
18689    }
18690
18691    #[simd_test(enable = "avx512fp16")]
18692    unsafe fn test_mm512_maskz_mul_round_pch() {
18693        let a = _mm512_set1_pch(0.0, 1.0);
18694        let b = _mm512_set1_pch(0.0, 1.0);
18695        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18696            0b0101010101010101,
18697            a,
18698            b,
18699        );
18700        let e = _mm512_setr_ph(
18701            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18702            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18703        );
18704        assert_eq_m512h(r, e);
18705    }
18706
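    // The scalar _sch intrinsics operate on the lowest complex pair (elements 0 and 1) only;
    // elements 2 through 7 of the result are copied from a, which is why the expected vectors
    // keep 2.0..7.0 from a rather than the corresponding values from b.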
18707    #[simd_test(enable = "avx512fp16")]
18708    unsafe fn test_mm_mul_round_sch() {
18709        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18710        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18711        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18712        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18713        assert_eq_m128h(r, e);
18714    }
18715
18716    #[simd_test(enable = "avx512fp16")]
18717    unsafe fn test_mm_mask_mul_round_sch() {
18718        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18719        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18720        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18721        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18722            src, 0, a, b,
18723        );
18724        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18725        assert_eq_m128h(r, e);
18726    }
18727
18728    #[simd_test(enable = "avx512fp16")]
18729    unsafe fn test_mm_maskz_mul_round_sch() {
18730        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18731        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18732        let r =
18733            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18734        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18735        assert_eq_m128h(r, e);
18736    }
18737
18738    #[simd_test(enable = "avx512fp16")]
18739    unsafe fn test_mm_mul_sch() {
18740        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18741        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18742        let r = _mm_mul_sch(a, b);
18743        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18744        assert_eq_m128h(r, e);
18745    }
18746
18747    #[simd_test(enable = "avx512fp16")]
18748    unsafe fn test_mm_mask_mul_sch() {
18749        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18750        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18751        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18752        let r = _mm_mask_mul_sch(src, 0, a, b);
18753        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18754        assert_eq_m128h(r, e);
18755    }
18756
18757    #[simd_test(enable = "avx512fp16")]
18758    unsafe fn test_mm_maskz_mul_sch() {
18759        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18760        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18761        let r = _mm_maskz_mul_sch(0, a, b);
18762        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18763        assert_eq_m128h(r, e);
18764    }
18765
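    // The fmul_* intrinsics perform the same packed complex multiply as the mul_* intrinsics
    // above, so the test data and expectations are identical.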
18766    #[simd_test(enable = "avx512fp16,avx512vl")]
18767    unsafe fn test_mm_fmul_pch() {
18768        let a = _mm_set1_pch(0.0, 1.0);
18769        let b = _mm_set1_pch(0.0, 1.0);
18770        let r = _mm_fmul_pch(a, b);
18771        let e = _mm_set1_pch(-1.0, 0.0);
18772        assert_eq_m128h(r, e);
18773    }
18774
18775    #[simd_test(enable = "avx512fp16,avx512vl")]
18776    unsafe fn test_mm_mask_fmul_pch() {
18777        let a = _mm_set1_pch(0.0, 1.0);
18778        let b = _mm_set1_pch(0.0, 1.0);
18779        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18780        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
18781        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18782        assert_eq_m128h(r, e);
18783    }
18784
18785    #[simd_test(enable = "avx512fp16,avx512vl")]
18786    unsafe fn test_mm_maskz_fmul_pch() {
18787        let a = _mm_set1_pch(0.0, 1.0);
18788        let b = _mm_set1_pch(0.0, 1.0);
18789        let r = _mm_maskz_fmul_pch(0b0101, a, b);
18790        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18791        assert_eq_m128h(r, e);
18792    }
18793
18794    #[simd_test(enable = "avx512fp16,avx512vl")]
18795    unsafe fn test_mm256_fmul_pch() {
18796        let a = _mm256_set1_pch(0.0, 1.0);
18797        let b = _mm256_set1_pch(0.0, 1.0);
18798        let r = _mm256_fmul_pch(a, b);
18799        let e = _mm256_set1_pch(-1.0, 0.0);
18800        assert_eq_m256h(r, e);
18801    }
18802
18803    #[simd_test(enable = "avx512fp16,avx512vl")]
18804    unsafe fn test_mm256_mask_fmul_pch() {
18805        let a = _mm256_set1_pch(0.0, 1.0);
18806        let b = _mm256_set1_pch(0.0, 1.0);
18807        let src = _mm256_setr_ph(
18808            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18809        );
18810        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
18811        let e = _mm256_setr_ph(
18812            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18813        );
18814        assert_eq_m256h(r, e);
18815    }
18816
18817    #[simd_test(enable = "avx512fp16,avx512vl")]
18818    unsafe fn test_mm256_maskz_fmul_pch() {
18819        let a = _mm256_set1_pch(0.0, 1.0);
18820        let b = _mm256_set1_pch(0.0, 1.0);
18821        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
18822        let e = _mm256_setr_ph(
18823            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18824        );
18825        assert_eq_m256h(r, e);
18826    }
18827
18828    #[simd_test(enable = "avx512fp16")]
18829    unsafe fn test_mm512_fmul_pch() {
18830        let a = _mm512_set1_pch(0.0, 1.0);
18831        let b = _mm512_set1_pch(0.0, 1.0);
18832        let r = _mm512_fmul_pch(a, b);
18833        let e = _mm512_set1_pch(-1.0, 0.0);
18834        assert_eq_m512h(r, e);
18835    }
18836
18837    #[simd_test(enable = "avx512fp16")]
18838    unsafe fn test_mm512_mask_fmul_pch() {
18839        let a = _mm512_set1_pch(0.0, 1.0);
18840        let b = _mm512_set1_pch(0.0, 1.0);
18841        let src = _mm512_setr_ph(
18842            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18843            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18844            32.0, 33.0,
18845        );
18846        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
18847        let e = _mm512_setr_ph(
18848            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18849            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18850            33.0,
18851        );
18852        assert_eq_m512h(r, e);
18853    }
18854
18855    #[simd_test(enable = "avx512fp16")]
18856    unsafe fn test_mm512_maskz_fmul_pch() {
18857        let a = _mm512_set1_pch(0.0, 1.0);
18858        let b = _mm512_set1_pch(0.0, 1.0);
18859        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
18860        let e = _mm512_setr_ph(
18861            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18862            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18863        );
18864        assert_eq_m512h(r, e);
18865    }
18866
18867    #[simd_test(enable = "avx512fp16")]
18868    unsafe fn test_mm512_fmul_round_pch() {
18869        let a = _mm512_set1_pch(0.0, 1.0);
18870        let b = _mm512_set1_pch(0.0, 1.0);
18871        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18872        let e = _mm512_set1_pch(-1.0, 0.0);
18873        assert_eq_m512h(r, e);
18874    }
18875
18876    #[simd_test(enable = "avx512fp16")]
18877    unsafe fn test_mm512_mask_fmul_round_pch() {
18878        let a = _mm512_set1_pch(0.0, 1.0);
18879        let b = _mm512_set1_pch(0.0, 1.0);
18880        let src = _mm512_setr_ph(
18881            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18882            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18883            32.0, 33.0,
18884        );
18885        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18886            src,
18887            0b0101010101010101,
18888            a,
18889            b,
18890        );
18891        let e = _mm512_setr_ph(
18892            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18893            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18894            33.0,
18895        );
18896        assert_eq_m512h(r, e);
18897    }
18898
18899    #[simd_test(enable = "avx512fp16")]
18900    unsafe fn test_mm512_maskz_fmul_round_pch() {
18901        let a = _mm512_set1_pch(0.0, 1.0);
18902        let b = _mm512_set1_pch(0.0, 1.0);
18903        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18904            0b0101010101010101,
18905            a,
18906            b,
18907        );
18908        let e = _mm512_setr_ph(
18909            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18910            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18911        );
18912        assert_eq_m512h(r, e);
18913    }
18914
18915    #[simd_test(enable = "avx512fp16")]
18916    unsafe fn test_mm_fmul_round_sch() {
18917        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18918        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18919        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18920        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18921        assert_eq_m128h(r, e);
18922    }
18923
18924    #[simd_test(enable = "avx512fp16")]
18925    unsafe fn test_mm_mask_fmul_round_sch() {
18926        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18927        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18928        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18929        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18930            src, 0, a, b,
18931        );
18932        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18933        assert_eq_m128h(r, e);
18934    }
18935
18936    #[simd_test(enable = "avx512fp16")]
18937    unsafe fn test_mm_maskz_fmul_round_sch() {
18938        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18939        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18940        let r =
18941            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18942        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18943        assert_eq_m128h(r, e);
18944    }
18945
18946    #[simd_test(enable = "avx512fp16")]
18947    unsafe fn test_mm_fmul_sch() {
18948        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18949        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18950        let r = _mm_fmul_sch(a, b);
18951        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18952        assert_eq_m128h(r, e);
18953    }
18954
18955    #[simd_test(enable = "avx512fp16")]
18956    unsafe fn test_mm_mask_fmul_sch() {
18957        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18958        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18959        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18960        let r = _mm_mask_fmul_sch(src, 0, a, b);
18961        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18962        assert_eq_m128h(r, e);
18963    }
18964
18965    #[simd_test(enable = "avx512fp16")]
18966    unsafe fn test_mm_maskz_fmul_sch() {
18967        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18968        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18969        let r = _mm_maskz_fmul_sch(0, a, b);
18970        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18971        assert_eq_m128h(r, e);
18972    }
18973
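    // The cmul_* intrinsics multiply a by the complex conjugate of b. Here a = i and
    // b = -i, so conj(b) = i and i * i = -1 + 0i for every selected pair.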
18974    #[simd_test(enable = "avx512fp16,avx512vl")]
18975    unsafe fn test_mm_cmul_pch() {
18976        let a = _mm_set1_pch(0.0, 1.0);
18977        let b = _mm_set1_pch(0.0, -1.0);
18978        let r = _mm_cmul_pch(a, b);
18979        let e = _mm_set1_pch(-1.0, 0.0);
18980        assert_eq_m128h(r, e);
18981    }
18982
18983    #[simd_test(enable = "avx512fp16,avx512vl")]
18984    unsafe fn test_mm_mask_cmul_pch() {
18985        let a = _mm_set1_pch(0.0, 1.0);
18986        let b = _mm_set1_pch(0.0, -1.0);
18987        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18988        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
18989        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18990        assert_eq_m128h(r, e);
18991    }
18992
18993    #[simd_test(enable = "avx512fp16,avx512vl")]
18994    unsafe fn test_mm_maskz_cmul_pch() {
18995        let a = _mm_set1_pch(0.0, 1.0);
18996        let b = _mm_set1_pch(0.0, -1.0);
18997        let r = _mm_maskz_cmul_pch(0b0101, a, b);
18998        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18999        assert_eq_m128h(r, e);
19000    }
19001
19002    #[simd_test(enable = "avx512fp16,avx512vl")]
19003    unsafe fn test_mm256_cmul_pch() {
19004        let a = _mm256_set1_pch(0.0, 1.0);
19005        let b = _mm256_set1_pch(0.0, -1.0);
19006        let r = _mm256_cmul_pch(a, b);
19007        let e = _mm256_set1_pch(-1.0, 0.0);
19008        assert_eq_m256h(r, e);
19009    }
19010
19011    #[simd_test(enable = "avx512fp16,avx512vl")]
19012    unsafe fn test_mm256_mask_cmul_pch() {
19013        let a = _mm256_set1_pch(0.0, 1.0);
19014        let b = _mm256_set1_pch(0.0, -1.0);
19015        let src = _mm256_setr_ph(
19016            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19017        );
19018        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
19019        let e = _mm256_setr_ph(
19020            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19021        );
19022        assert_eq_m256h(r, e);
19023    }
19024
19025    #[simd_test(enable = "avx512fp16,avx512vl")]
19026    unsafe fn test_mm256_maskz_cmul_pch() {
19027        let a = _mm256_set1_pch(0.0, 1.0);
19028        let b = _mm256_set1_pch(0.0, -1.0);
19029        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
19030        let e = _mm256_setr_ph(
19031            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19032        );
19033        assert_eq_m256h(r, e);
19034    }
19035
19036    #[simd_test(enable = "avx512fp16")]
19037    unsafe fn test_mm512_cmul_pch() {
19038        let a = _mm512_set1_pch(0.0, 1.0);
19039        let b = _mm512_set1_pch(0.0, -1.0);
19040        let r = _mm512_cmul_pch(a, b);
19041        let e = _mm512_set1_pch(-1.0, 0.0);
19042        assert_eq_m512h(r, e);
19043    }
19044
19045    #[simd_test(enable = "avx512fp16")]
19046    unsafe fn test_mm512_mask_cmul_pch() {
19047        let a = _mm512_set1_pch(0.0, 1.0);
19048        let b = _mm512_set1_pch(0.0, -1.0);
19049        let src = _mm512_setr_ph(
19050            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19051            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19052            32.0, 33.0,
19053        );
19054        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
19055        let e = _mm512_setr_ph(
19056            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19057            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19058            33.0,
19059        );
19060        assert_eq_m512h(r, e);
19061    }
19062
19063    #[simd_test(enable = "avx512fp16")]
19064    unsafe fn test_mm512_maskz_cmul_pch() {
19065        let a = _mm512_set1_pch(0.0, 1.0);
19066        let b = _mm512_set1_pch(0.0, -1.0);
19067        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
19068        let e = _mm512_setr_ph(
19069            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19070            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19071        );
19072        assert_eq_m512h(r, e);
19073    }
19074
19075    #[simd_test(enable = "avx512fp16")]
19076    unsafe fn test_mm512_cmul_round_pch() {
19077        let a = _mm512_set1_pch(0.0, 1.0);
19078        let b = _mm512_set1_pch(0.0, -1.0);
19079        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19080        let e = _mm512_set1_pch(-1.0, 0.0);
19081        assert_eq_m512h(r, e);
19082    }
19083
19084    #[simd_test(enable = "avx512fp16")]
19085    unsafe fn test_mm512_mask_cmul_round_pch() {
19086        let a = _mm512_set1_pch(0.0, 1.0);
19087        let b = _mm512_set1_pch(0.0, -1.0);
19088        let src = _mm512_setr_ph(
19089            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19090            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19091            32.0, 33.0,
19092        );
19093        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19094            src,
19095            0b0101010101010101,
19096            a,
19097            b,
19098        );
19099        let e = _mm512_setr_ph(
19100            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19101            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19102            33.0,
19103        );
19104        assert_eq_m512h(r, e);
19105    }
19106
19107    #[simd_test(enable = "avx512fp16")]
19108    unsafe fn test_mm512_maskz_cmul_round_pch() {
19109        let a = _mm512_set1_pch(0.0, 1.0);
19110        let b = _mm512_set1_pch(0.0, -1.0);
19111        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19112            0b0101010101010101,
19113            a,
19114            b,
19115        );
19116        let e = _mm512_setr_ph(
19117            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19118            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19119        );
19120        assert_eq_m512h(r, e);
19121    }
19122
19123    #[simd_test(enable = "avx512fp16")]
19124    unsafe fn test_mm_cmul_sch() {
19125        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19126        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19127        let r = _mm_cmul_sch(a, b);
19128        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19129        assert_eq_m128h(r, e);
19130    }
19131
19132    #[simd_test(enable = "avx512fp16")]
19133    unsafe fn test_mm_mask_cmul_sch() {
19134        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19135        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19136        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19137        let r = _mm_mask_cmul_sch(src, 0, a, b);
19138        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19139        assert_eq_m128h(r, e);
19140    }
19141
19142    #[simd_test(enable = "avx512fp16")]
19143    unsafe fn test_mm_maskz_cmul_sch() {
19144        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19145        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19146        let r = _mm_maskz_cmul_sch(0, a, b);
19147        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19148        assert_eq_m128h(r, e);
19149    }
19150
19151    #[simd_test(enable = "avx512fp16")]
19152    unsafe fn test_mm_cmul_round_sch() {
19153        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19154        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19155        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19156        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19157        assert_eq_m128h(r, e);
19158    }
19159
19160    #[simd_test(enable = "avx512fp16")]
19161    unsafe fn test_mm_mask_cmul_round_sch() {
19162        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19163        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19164        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19165        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19166            src, 0, a, b,
19167        );
19168        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19169        assert_eq_m128h(r, e);
19170    }
19171
19172    #[simd_test(enable = "avx512fp16")]
19173    unsafe fn test_mm_maskz_cmul_round_sch() {
19174        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19175        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19176        let r =
19177            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19178        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19179        assert_eq_m128h(r, e);
19180    }
19181
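    // The fcmul_* intrinsics perform the same conjugate multiply as the cmul_* intrinsics
    // above, so the test data and expectations are identical.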
19182    #[simd_test(enable = "avx512fp16,avx512vl")]
19183    unsafe fn test_mm_fcmul_pch() {
19184        let a = _mm_set1_pch(0.0, 1.0);
19185        let b = _mm_set1_pch(0.0, -1.0);
19186        let r = _mm_fcmul_pch(a, b);
19187        let e = _mm_set1_pch(-1.0, 0.0);
19188        assert_eq_m128h(r, e);
19189    }
19190
19191    #[simd_test(enable = "avx512fp16,avx512vl")]
19192    unsafe fn test_mm_mask_fcmul_pch() {
19193        let a = _mm_set1_pch(0.0, 1.0);
19194        let b = _mm_set1_pch(0.0, -1.0);
19195        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19196        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
19197        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
19198        assert_eq_m128h(r, e);
19199    }
19200
19201    #[simd_test(enable = "avx512fp16,avx512vl")]
19202    unsafe fn test_mm_maskz_fcmul_pch() {
19203        let a = _mm_set1_pch(0.0, 1.0);
19204        let b = _mm_set1_pch(0.0, -1.0);
19205        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
19206        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
19207        assert_eq_m128h(r, e);
19208    }
19209
19210    #[simd_test(enable = "avx512fp16,avx512vl")]
19211    unsafe fn test_mm256_fcmul_pch() {
19212        let a = _mm256_set1_pch(0.0, 1.0);
19213        let b = _mm256_set1_pch(0.0, -1.0);
19214        let r = _mm256_fcmul_pch(a, b);
19215        let e = _mm256_set1_pch(-1.0, 0.0);
19216        assert_eq_m256h(r, e);
19217    }
19218
19219    #[simd_test(enable = "avx512fp16,avx512vl")]
19220    unsafe fn test_mm256_mask_fcmul_pch() {
19221        let a = _mm256_set1_pch(0.0, 1.0);
19222        let b = _mm256_set1_pch(0.0, -1.0);
19223        let src = _mm256_setr_ph(
19224            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19225        );
19226        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
19227        let e = _mm256_setr_ph(
19228            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19229        );
19230        assert_eq_m256h(r, e);
19231    }
19232
19233    #[simd_test(enable = "avx512fp16,avx512vl")]
19234    unsafe fn test_mm256_maskz_fcmul_pch() {
19235        let a = _mm256_set1_pch(0.0, 1.0);
19236        let b = _mm256_set1_pch(0.0, -1.0);
19237        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
19238        let e = _mm256_setr_ph(
19239            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19240        );
19241        assert_eq_m256h(r, e);
19242    }
19243
19244    #[simd_test(enable = "avx512fp16")]
19245    unsafe fn test_mm512_fcmul_pch() {
19246        let a = _mm512_set1_pch(0.0, 1.0);
19247        let b = _mm512_set1_pch(0.0, -1.0);
19248        let r = _mm512_fcmul_pch(a, b);
19249        let e = _mm512_set1_pch(-1.0, 0.0);
19250        assert_eq_m512h(r, e);
19251    }
19252
19253    #[simd_test(enable = "avx512fp16")]
19254    unsafe fn test_mm512_mask_fcmul_pch() {
19255        let a = _mm512_set1_pch(0.0, 1.0);
19256        let b = _mm512_set1_pch(0.0, -1.0);
19257        let src = _mm512_setr_ph(
19258            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19259            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19260            32.0, 33.0,
19261        );
19262        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
19263        let e = _mm512_setr_ph(
19264            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19265            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19266            33.0,
19267        );
19268        assert_eq_m512h(r, e);
19269    }
19270
19271    #[simd_test(enable = "avx512fp16")]
19272    unsafe fn test_mm512_maskz_fcmul_pch() {
19273        let a = _mm512_set1_pch(0.0, 1.0);
19274        let b = _mm512_set1_pch(0.0, -1.0);
19275        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
19276        let e = _mm512_setr_ph(
19277            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19278            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19279        );
19280        assert_eq_m512h(r, e);
19281    }
19282
19283    #[simd_test(enable = "avx512fp16")]
19284    unsafe fn test_mm512_fcmul_round_pch() {
19285        let a = _mm512_set1_pch(0.0, 1.0);
19286        let b = _mm512_set1_pch(0.0, -1.0);
19287        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19288        let e = _mm512_set1_pch(-1.0, 0.0);
19289        assert_eq_m512h(r, e);
19290    }
19291
19292    #[simd_test(enable = "avx512fp16")]
19293    unsafe fn test_mm512_mask_fcmul_round_pch() {
19294        let a = _mm512_set1_pch(0.0, 1.0);
19295        let b = _mm512_set1_pch(0.0, -1.0);
19296        let src = _mm512_setr_ph(
19297            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19298            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19299            32.0, 33.0,
19300        );
19301        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19302            src,
19303            0b0101010101010101,
19304            a,
19305            b,
19306        );
19307        let e = _mm512_setr_ph(
19308            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19309            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19310            33.0,
19311        );
19312        assert_eq_m512h(r, e);
19313    }
19314
19315    #[simd_test(enable = "avx512fp16")]
19316    unsafe fn test_mm512_maskz_fcmul_round_pch() {
19317        let a = _mm512_set1_pch(0.0, 1.0);
19318        let b = _mm512_set1_pch(0.0, -1.0);
19319        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19320            0b0101010101010101,
19321            a,
19322            b,
19323        );
19324        let e = _mm512_setr_ph(
19325            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19326            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19327        );
19328        assert_eq_m512h(r, e);
19329    }
19330
19331    #[simd_test(enable = "avx512fp16")]
19332    unsafe fn test_mm_fcmul_sch() {
19333        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19334        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19335        let r = _mm_fcmul_sch(a, b);
19336        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19337        assert_eq_m128h(r, e);
19338    }
19339
19340    #[simd_test(enable = "avx512fp16")]
19341    unsafe fn test_mm_mask_fcmul_sch() {
19342        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19343        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19344        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19345        let r = _mm_mask_fcmul_sch(src, 0, a, b);
19346        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19347        assert_eq_m128h(r, e);
19348    }
19349
19350    #[simd_test(enable = "avx512fp16")]
19351    unsafe fn test_mm_maskz_fcmul_sch() {
19352        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19353        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19354        let r = _mm_maskz_fcmul_sch(0, a, b);
19355        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19356        assert_eq_m128h(r, e);
19357    }
19358
19359    #[simd_test(enable = "avx512fp16")]
19360    unsafe fn test_mm_fcmul_round_sch() {
19361        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19362        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19363        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19364        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19365        assert_eq_m128h(r, e);
19366    }
19367
19368    #[simd_test(enable = "avx512fp16")]
19369    unsafe fn test_mm_mask_fcmul_round_sch() {
19370        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19371        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19372        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19373        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19374            src, 0, a, b,
19375        );
19376        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19377        assert_eq_m128h(r, e);
19378    }
19379
19380    #[simd_test(enable = "avx512fp16")]
19381    unsafe fn test_mm_maskz_fcmul_round_sch() {
19382        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19383        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19384        let r =
19385            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19386        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19387        assert_eq_m128h(r, e);
19388    }
19389
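    // abs_ph takes the absolute value of each f16 element.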
19390    #[simd_test(enable = "avx512fp16,avx512vl")]
19391    unsafe fn test_mm_abs_ph() {
19392        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
19393        let r = _mm_abs_ph(a);
19394        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
19395        assert_eq_m128h(r, e);
19396    }
19397
19398    #[simd_test(enable = "avx512fp16,avx512vl")]
19399    unsafe fn test_mm256_abs_ph() {
19400        let a = _mm256_set_ph(
19401            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19402            -14.0,
19403        );
19404        let r = _mm256_abs_ph(a);
19405        let e = _mm256_set_ph(
19406            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19407        );
19408        assert_eq_m256h(r, e);
19409    }
19410
19411    #[simd_test(enable = "avx512fp16")]
19412    unsafe fn test_mm512_abs_ph() {
19413        let a = _mm512_set_ph(
19414            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19415            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
19416            27.0, -28.0, 29.0, -30.0,
19417        );
19418        let r = _mm512_abs_ph(a);
19419        let e = _mm512_set_ph(
19420            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19421            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
19422            29.0, 30.0,
19423        );
19424        assert_eq_m512h(r, e);
19425    }
19426
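    // conj_pch negates the imaginary lane of each complex pair, turning i into -i. Unselected
    // pairs come from src (mask) or are zeroed (maskz).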
19427    #[simd_test(enable = "avx512fp16,avx512vl")]
19428    unsafe fn test_mm_conj_pch() {
19429        let a = _mm_set1_pch(0.0, 1.0);
19430        let r = _mm_conj_pch(a);
19431        let e = _mm_set1_pch(0.0, -1.0);
19432        assert_eq_m128h(r, e);
19433    }
19434
19435    #[simd_test(enable = "avx512fp16,avx512vl")]
19436    unsafe fn test_mm_mask_conj_pch() {
19437        let a = _mm_set1_pch(0.0, 1.0);
19438        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19439        let r = _mm_mask_conj_pch(src, 0b0101, a);
19440        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
19441        assert_eq_m128h(r, e);
19442    }
19443
19444    #[simd_test(enable = "avx512fp16,avx512vl")]
19445    unsafe fn test_mm_maskz_conj_pch() {
19446        let a = _mm_set1_pch(0.0, 1.0);
19447        let r = _mm_maskz_conj_pch(0b0101, a);
19448        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
19449        assert_eq_m128h(r, e);
19450    }
19451
19452    #[simd_test(enable = "avx512fp16,avx512vl")]
19453    unsafe fn test_mm256_conj_pch() {
19454        let a = _mm256_set1_pch(0.0, 1.0);
19455        let r = _mm256_conj_pch(a);
19456        let e = _mm256_set1_pch(0.0, -1.0);
19457        assert_eq_m256h(r, e);
19458    }
19459
19460    #[simd_test(enable = "avx512fp16,avx512vl")]
19461    unsafe fn test_mm256_mask_conj_pch() {
19462        let a = _mm256_set1_pch(0.0, 1.0);
19463        let src = _mm256_setr_ph(
19464            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19465        );
19466        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
19467        let e = _mm256_setr_ph(
19468            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19469        );
19470        assert_eq_m256h(r, e);
19471    }
19472
19473    #[simd_test(enable = "avx512fp16,avx512vl")]
19474    unsafe fn test_mm256_maskz_conj_pch() {
19475        let a = _mm256_set1_pch(0.0, 1.0);
19476        let r = _mm256_maskz_conj_pch(0b01010101, a);
19477        let e = _mm256_setr_ph(
19478            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19479        );
19480        assert_eq_m256h(r, e);
19481    }
19482
19483    #[simd_test(enable = "avx512fp16")]
19484    unsafe fn test_mm512_conj_pch() {
19485        let a = _mm512_set1_pch(0.0, 1.0);
19486        let r = _mm512_conj_pch(a);
19487        let e = _mm512_set1_pch(0.0, -1.0);
19488        assert_eq_m512h(r, e);
19489    }
19490
19491    #[simd_test(enable = "avx512fp16")]
19492    unsafe fn test_mm512_mask_conj_pch() {
19493        let a = _mm512_set1_pch(0.0, 1.0);
19494        let src = _mm512_setr_ph(
19495            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19496            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19497            32.0, 33.0,
19498        );
19499        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
19500        let e = _mm512_setr_ph(
19501            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19502            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
19503            33.0,
19504        );
19505        assert_eq_m512h(r, e);
19506    }
19507
19508    #[simd_test(enable = "avx512fp16")]
19509    unsafe fn test_mm512_maskz_conj_pch() {
19510        let a = _mm512_set1_pch(0.0, 1.0);
19511        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
19512        let e = _mm512_setr_ph(
19513            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19514            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19515        );
19516        assert_eq_m512h(r, e);
19517    }
19518
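    // fmadd_pch computes a * b + c per complex pair: (0 + 1i) * (0 + 2i) + (0 + 3i) = -2 + 3i.
    // Unselected pairs keep a (mask variants), keep c (mask3 variants), or are zeroed (maskz).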
19519    #[simd_test(enable = "avx512fp16,avx512vl")]
19520    unsafe fn test_mm_fmadd_pch() {
19521        let a = _mm_set1_pch(0.0, 1.0);
19522        let b = _mm_set1_pch(0.0, 2.0);
19523        let c = _mm_set1_pch(0.0, 3.0);
19524        let r = _mm_fmadd_pch(a, b, c);
19525        let e = _mm_set1_pch(-2.0, 3.0);
19526        assert_eq_m128h(r, e);
19527    }
19528
19529    #[simd_test(enable = "avx512fp16,avx512vl")]
19530    unsafe fn test_mm_mask_fmadd_pch() {
19531        let a = _mm_set1_pch(0.0, 1.0);
19532        let b = _mm_set1_pch(0.0, 2.0);
19533        let c = _mm_set1_pch(0.0, 3.0);
19534        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
19535        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
19536        assert_eq_m128h(r, e);
19537    }
19538
19539    #[simd_test(enable = "avx512fp16,avx512vl")]
19540    unsafe fn test_mm_mask3_fmadd_pch() {
19541        let a = _mm_set1_pch(0.0, 1.0);
19542        let b = _mm_set1_pch(0.0, 2.0);
19543        let c = _mm_set1_pch(0.0, 3.0);
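        // mask3 form: pairs whose bit is clear keep their value from c instead of a.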
19544        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
19545        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
19546        assert_eq_m128h(r, e);
19547    }
19548
19549    #[simd_test(enable = "avx512fp16,avx512vl")]
19550    unsafe fn test_mm_maskz_fmadd_pch() {
19551        let a = _mm_set1_pch(0.0, 1.0);
19552        let b = _mm_set1_pch(0.0, 2.0);
19553        let c = _mm_set1_pch(0.0, 3.0);
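        // maskz form: pairs whose bit is clear are zeroed.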
19554        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
19555        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
19556        assert_eq_m128h(r, e);
19557    }
19558
19559    #[simd_test(enable = "avx512fp16,avx512vl")]
19560    unsafe fn test_mm256_fmadd_pch() {
19561        let a = _mm256_set1_pch(0.0, 1.0);
19562        let b = _mm256_set1_pch(0.0, 2.0);
19563        let c = _mm256_set1_pch(0.0, 3.0);
19564        let r = _mm256_fmadd_pch(a, b, c);
19565        let e = _mm256_set1_pch(-2.0, 3.0);
19566        assert_eq_m256h(r, e);
19567    }
19568
19569    #[simd_test(enable = "avx512fp16,avx512vl")]
19570    unsafe fn test_mm256_mask_fmadd_pch() {
19571        let a = _mm256_set1_pch(0.0, 1.0);
19572        let b = _mm256_set1_pch(0.0, 2.0);
19573        let c = _mm256_set1_pch(0.0, 3.0);
19574        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
19575        let e = _mm256_setr_ph(
19576            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19577        );
19578        assert_eq_m256h(r, e);
19579    }
19580
19581    #[simd_test(enable = "avx512fp16,avx512vl")]
19582    unsafe fn test_mm256_mask3_fmadd_pch() {
19583        let a = _mm256_set1_pch(0.0, 1.0);
19584        let b = _mm256_set1_pch(0.0, 2.0);
19585        let c = _mm256_set1_pch(0.0, 3.0);
19586        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
19587        let e = _mm256_setr_ph(
19588            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19589        );
19590        assert_eq_m256h(r, e);
19591    }
19592
19593    #[simd_test(enable = "avx512fp16,avx512vl")]
19594    unsafe fn test_mm256_maskz_fmadd_pch() {
19595        let a = _mm256_set1_pch(0.0, 1.0);
19596        let b = _mm256_set1_pch(0.0, 2.0);
19597        let c = _mm256_set1_pch(0.0, 3.0);
19598        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
19599        let e = _mm256_setr_ph(
19600            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19601        );
19602        assert_eq_m256h(r, e);
19603    }
19604
19605    #[simd_test(enable = "avx512fp16")]
19606    unsafe fn test_mm512_fmadd_pch() {
19607        let a = _mm512_set1_pch(0.0, 1.0);
19608        let b = _mm512_set1_pch(0.0, 2.0);
19609        let c = _mm512_set1_pch(0.0, 3.0);
19610        let r = _mm512_fmadd_pch(a, b, c);
19611        let e = _mm512_set1_pch(-2.0, 3.0);
19612        assert_eq_m512h(r, e);
19613    }
19614
19615    #[simd_test(enable = "avx512fp16")]
19616    unsafe fn test_mm512_mask_fmadd_pch() {
19617        let a = _mm512_set1_pch(0.0, 1.0);
19618        let b = _mm512_set1_pch(0.0, 2.0);
19619        let c = _mm512_set1_pch(0.0, 3.0);
19620        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
19621        let e = _mm512_setr_ph(
19622            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19623            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19624        );
19625        assert_eq_m512h(r, e);
19626    }
19627
19628    #[simd_test(enable = "avx512fp16")]
19629    unsafe fn test_mm512_mask3_fmadd_pch() {
19630        let a = _mm512_set1_pch(0.0, 1.0);
19631        let b = _mm512_set1_pch(0.0, 2.0);
19632        let c = _mm512_set1_pch(0.0, 3.0);
19633        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
19634        let e = _mm512_setr_ph(
19635            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19636            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19637        );
19638        assert_eq_m512h(r, e);
19639    }
19640
19641    #[simd_test(enable = "avx512fp16")]
19642    unsafe fn test_mm512_maskz_fmadd_pch() {
19643        let a = _mm512_set1_pch(0.0, 1.0);
19644        let b = _mm512_set1_pch(0.0, 2.0);
19645        let c = _mm512_set1_pch(0.0, 3.0);
19646        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
19647        let e = _mm512_setr_ph(
19648            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19649            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19650        );
19651        assert_eq_m512h(r, e);
19652    }
19653
19654    #[simd_test(enable = "avx512fp16")]
19655    unsafe fn test_mm512_fmadd_round_pch() {
19656        let a = _mm512_set1_pch(0.0, 1.0);
19657        let b = _mm512_set1_pch(0.0, 2.0);
19658        let c = _mm512_set1_pch(0.0, 3.0);
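        // Explicit rounding control (round to nearest, exceptions suppressed); the expected values
        // match the non-_round variant since these inputs need no rounding.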
19659        let r =
19660            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19661        let e = _mm512_set1_pch(-2.0, 3.0);
19662        assert_eq_m512h(r, e);
19663    }
19664
19665    #[simd_test(enable = "avx512fp16")]
19666    unsafe fn test_mm512_mask_fmadd_round_pch() {
19667        let a = _mm512_set1_pch(0.0, 1.0);
19668        let b = _mm512_set1_pch(0.0, 2.0);
19669        let c = _mm512_set1_pch(0.0, 3.0);
19670        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19671            a,
19672            0b0101010101010101,
19673            b,
19674            c,
19675        );
19676        let e = _mm512_setr_ph(
19677            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19678            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19679        );
19680        assert_eq_m512h(r, e);
19681    }
19682
19683    #[simd_test(enable = "avx512fp16")]
19684    unsafe fn test_mm512_mask3_fmadd_round_pch() {
19685        let a = _mm512_set1_pch(0.0, 1.0);
19686        let b = _mm512_set1_pch(0.0, 2.0);
19687        let c = _mm512_set1_pch(0.0, 3.0);
19688        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19689            a,
19690            b,
19691            c,
19692            0b0101010101010101,
19693        );
19694        let e = _mm512_setr_ph(
19695            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19696            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19697        );
19698        assert_eq_m512h(r, e);
19699    }
19700
19701    #[simd_test(enable = "avx512fp16")]
19702    unsafe fn test_mm512_maskz_fmadd_round_pch() {
19703        let a = _mm512_set1_pch(0.0, 1.0);
19704        let b = _mm512_set1_pch(0.0, 2.0);
19705        let c = _mm512_set1_pch(0.0, 3.0);
19706        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19707            0b0101010101010101,
19708            a,
19709            b,
19710            c,
19711        );
19712        let e = _mm512_setr_ph(
19713            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19714            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19715        );
19716        assert_eq_m512h(r, e);
19717    }
19718
19719    #[simd_test(enable = "avx512fp16")]
19720    unsafe fn test_mm_fmadd_sch() {
19721        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19722        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19723        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
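        // Scalar complex FMA: only the lowest (re, im) pair is computed (-2 + 3i); the remaining lanes are copied from a.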
19724        let r = _mm_fmadd_sch(a, b, c);
19725        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19726        assert_eq_m128h(r, e);
19727    }
19728
19729    #[simd_test(enable = "avx512fp16")]
19730    unsafe fn test_mm_mask_fmadd_sch() {
19731        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19732        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19733        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
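        // With the mask bit clear the result is simply a; with it set the lowest pair becomes -2 + 3i.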
19734        let r = _mm_mask_fmadd_sch(a, 0, b, c);
19735        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19736        assert_eq_m128h(r, e);
19737        let r = _mm_mask_fmadd_sch(a, 1, b, c);
19738        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19739        assert_eq_m128h(r, e);
19740    }
19741
19742    #[simd_test(enable = "avx512fp16")]
19743    unsafe fn test_mm_mask3_fmadd_sch() {
19744        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19745        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19746        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19747        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
19748        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19749        assert_eq_m128h(r, e);
19750        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
19751        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19752        assert_eq_m128h(r, e);
19753    }
19754
19755    #[simd_test(enable = "avx512fp16")]
19756    unsafe fn test_mm_maskz_fmadd_sch() {
19757        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19758        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19759        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19760        let r = _mm_maskz_fmadd_sch(0, a, b, c);
19761        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19762        assert_eq_m128h(r, e);
19763        let r = _mm_maskz_fmadd_sch(1, a, b, c);
19764        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19765        assert_eq_m128h(r, e);
19766    }
19767
19768    #[simd_test(enable = "avx512fp16")]
19769    unsafe fn test_mm_fmadd_round_sch() {
19770        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19771        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19772        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19773        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19774        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19775        assert_eq_m128h(r, e);
19776    }
19777
19778    #[simd_test(enable = "avx512fp16")]
19779    unsafe fn test_mm_mask_fmadd_round_sch() {
19780        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19781        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19782        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19783        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19784            a, 0, b, c,
19785        );
19786        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19787        assert_eq_m128h(r, e);
19788        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19789            a, 1, b, c,
19790        );
19791        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19792        assert_eq_m128h(r, e);
19793    }
19794
19795    #[simd_test(enable = "avx512fp16")]
19796    unsafe fn test_mm_mask3_fmadd_round_sch() {
19797        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19798        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19799        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19800        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19801            a, b, c, 0,
19802        );
19803        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19804        assert_eq_m128h(r, e);
19805        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19806            a, b, c, 1,
19807        );
19808        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19809        assert_eq_m128h(r, e);
19810    }
19811
19812    #[simd_test(enable = "avx512fp16")]
19813    unsafe fn test_mm_maskz_fmadd_round_sch() {
19814        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19815        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19816        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19817        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19818            0, a, b, c,
19819        );
19820        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19821        assert_eq_m128h(r, e);
19822        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19823            1, a, b, c,
19824        );
19825        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19826        assert_eq_m128h(r, e);
19827    }
19828
19829    #[simd_test(enable = "avx512fp16,avx512vl")]
19830    unsafe fn test_mm_fcmadd_pch() {
19831        let a = _mm_set1_pch(0.0, 1.0);
19832        let b = _mm_set1_pch(0.0, 2.0);
19833        let c = _mm_set1_pch(0.0, 3.0);
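        // Conjugate complex FMA: one multiplicand is conjugated, so the product term is 2 + 0i and the sum is 2 + 3i.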
19834        let r = _mm_fcmadd_pch(a, b, c);
19835        let e = _mm_set1_pch(2.0, 3.0);
19836        assert_eq_m128h(r, e);
19837    }
19838
19839    #[simd_test(enable = "avx512fp16,avx512vl")]
19840    unsafe fn test_mm_mask_fcmadd_pch() {
19841        let a = _mm_set1_pch(0.0, 1.0);
19842        let b = _mm_set1_pch(0.0, 2.0);
19843        let c = _mm_set1_pch(0.0, 3.0);
19844        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
19845        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
19846        assert_eq_m128h(r, e);
19847    }
19848
19849    #[simd_test(enable = "avx512fp16,avx512vl")]
19850    unsafe fn test_mm_mask3_fcmadd_pch() {
19851        let a = _mm_set1_pch(0.0, 1.0);
19852        let b = _mm_set1_pch(0.0, 2.0);
19853        let c = _mm_set1_pch(0.0, 3.0);
19854        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
19855        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
19856        assert_eq_m128h(r, e);
19857    }
19858
19859    #[simd_test(enable = "avx512fp16,avx512vl")]
19860    unsafe fn test_mm_maskz_fcmadd_pch() {
19861        let a = _mm_set1_pch(0.0, 1.0);
19862        let b = _mm_set1_pch(0.0, 2.0);
19863        let c = _mm_set1_pch(0.0, 3.0);
19864        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
19865        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
19866        assert_eq_m128h(r, e);
19867    }
19868
19869    #[simd_test(enable = "avx512fp16,avx512vl")]
19870    unsafe fn test_mm256_fcmadd_pch() {
19871        let a = _mm256_set1_pch(0.0, 1.0);
19872        let b = _mm256_set1_pch(0.0, 2.0);
19873        let c = _mm256_set1_pch(0.0, 3.0);
19874        let r = _mm256_fcmadd_pch(a, b, c);
19875        let e = _mm256_set1_pch(2.0, 3.0);
19876        assert_eq_m256h(r, e);
19877    }
19878
19879    #[simd_test(enable = "avx512fp16,avx512vl")]
19880    unsafe fn test_mm256_mask_fcmadd_pch() {
19881        let a = _mm256_set1_pch(0.0, 1.0);
19882        let b = _mm256_set1_pch(0.0, 2.0);
19883        let c = _mm256_set1_pch(0.0, 3.0);
19884        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
19885        let e = _mm256_setr_ph(
19886            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19887        );
19888        assert_eq_m256h(r, e);
19889    }
19890
19891    #[simd_test(enable = "avx512fp16,avx512vl")]
19892    unsafe fn test_mm256_mask3_fcmadd_pch() {
19893        let a = _mm256_set1_pch(0.0, 1.0);
19894        let b = _mm256_set1_pch(0.0, 2.0);
19895        let c = _mm256_set1_pch(0.0, 3.0);
19896        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
19897        let e = _mm256_setr_ph(
19898            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19899        );
19900        assert_eq_m256h(r, e);
19901    }
19902
19903    #[simd_test(enable = "avx512fp16,avx512vl")]
19904    unsafe fn test_mm256_maskz_fcmadd_pch() {
19905        let a = _mm256_set1_pch(0.0, 1.0);
19906        let b = _mm256_set1_pch(0.0, 2.0);
19907        let c = _mm256_set1_pch(0.0, 3.0);
19908        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
19909        let e = _mm256_setr_ph(
19910            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19911        );
19912        assert_eq_m256h(r, e);
19913    }
19914
19915    #[simd_test(enable = "avx512fp16")]
19916    unsafe fn test_mm512_fcmadd_pch() {
19917        let a = _mm512_set1_pch(0.0, 1.0);
19918        let b = _mm512_set1_pch(0.0, 2.0);
19919        let c = _mm512_set1_pch(0.0, 3.0);
19920        let r = _mm512_fcmadd_pch(a, b, c);
19921        let e = _mm512_set1_pch(2.0, 3.0);
19922        assert_eq_m512h(r, e);
19923    }
19924
19925    #[simd_test(enable = "avx512fp16")]
19926    unsafe fn test_mm512_mask_fcmadd_pch() {
19927        let a = _mm512_set1_pch(0.0, 1.0);
19928        let b = _mm512_set1_pch(0.0, 2.0);
19929        let c = _mm512_set1_pch(0.0, 3.0);
19930        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
19931        let e = _mm512_setr_ph(
19932            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19933            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19934        );
19935        assert_eq_m512h(r, e);
19936    }
19937
19938    #[simd_test(enable = "avx512fp16")]
19939    unsafe fn test_mm512_mask3_fcmadd_pch() {
19940        let a = _mm512_set1_pch(0.0, 1.0);
19941        let b = _mm512_set1_pch(0.0, 2.0);
19942        let c = _mm512_set1_pch(0.0, 3.0);
19943        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
19944        let e = _mm512_setr_ph(
19945            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
19946            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19947        );
19948        assert_eq_m512h(r, e);
19949    }
19950
19951    #[simd_test(enable = "avx512fp16")]
19952    unsafe fn test_mm512_maskz_fcmadd_pch() {
19953        let a = _mm512_set1_pch(0.0, 1.0);
19954        let b = _mm512_set1_pch(0.0, 2.0);
19955        let c = _mm512_set1_pch(0.0, 3.0);
19956        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
19957        let e = _mm512_setr_ph(
19958            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
19959            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19960        );
19961        assert_eq_m512h(r, e);
19962    }
19963
19964    #[simd_test(enable = "avx512fp16")]
19965    unsafe fn test_mm512_fcmadd_round_pch() {
19966        let a = _mm512_set1_pch(0.0, 1.0);
19967        let b = _mm512_set1_pch(0.0, 2.0);
19968        let c = _mm512_set1_pch(0.0, 3.0);
19969        let r =
19970            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19971        let e = _mm512_set1_pch(2.0, 3.0);
19972        assert_eq_m512h(r, e);
19973    }
19974
19975    #[simd_test(enable = "avx512fp16")]
19976    unsafe fn test_mm512_mask_fcmadd_round_pch() {
19977        let a = _mm512_set1_pch(0.0, 1.0);
19978        let b = _mm512_set1_pch(0.0, 2.0);
19979        let c = _mm512_set1_pch(0.0, 3.0);
19980        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19981            a,
19982            0b0101010101010101,
19983            b,
19984            c,
19985        );
19986        let e = _mm512_setr_ph(
19987            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19988            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19989        );
19990        assert_eq_m512h(r, e);
19991    }
19992
19993    #[simd_test(enable = "avx512fp16")]
19994    unsafe fn test_mm512_mask3_fcmadd_round_pch() {
19995        let a = _mm512_set1_pch(0.0, 1.0);
19996        let b = _mm512_set1_pch(0.0, 2.0);
19997        let c = _mm512_set1_pch(0.0, 3.0);
19998        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19999            a,
20000            b,
20001            c,
20002            0b0101010101010101,
20003        );
20004        let e = _mm512_setr_ph(
20005            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
20006            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
20007        );
20008        assert_eq_m512h(r, e);
20009    }
20010
20011    #[simd_test(enable = "avx512fp16")]
20012    unsafe fn test_mm512_maskz_fcmadd_round_pch() {
20013        let a = _mm512_set1_pch(0.0, 1.0);
20014        let b = _mm512_set1_pch(0.0, 2.0);
20015        let c = _mm512_set1_pch(0.0, 3.0);
20016        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20017            0b0101010101010101,
20018            a,
20019            b,
20020            c,
20021        );
20022        let e = _mm512_setr_ph(
20023            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
20024            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
20025        );
20026        assert_eq_m512h(r, e);
20027    }
20028
20029    #[simd_test(enable = "avx512fp16")]
20030    unsafe fn test_mm_fcmadd_sch() {
20031        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20032        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20033        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
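        // Scalar conjugate complex FMA: the lowest pair becomes 2 + 3i; upper lanes are again copied from a.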
20034        let r = _mm_fcmadd_sch(a, b, c);
20035        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20036        assert_eq_m128h(r, e);
20037    }
20038
20039    #[simd_test(enable = "avx512fp16")]
20040    unsafe fn test_mm_mask_fcmadd_sch() {
20041        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20042        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20043        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20044        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
20045        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20046        assert_eq_m128h(r, e);
20047        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
20048        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20049        assert_eq_m128h(r, e);
20050    }
20051
20052    #[simd_test(enable = "avx512fp16")]
20053    unsafe fn test_mm_mask3_fcmadd_sch() {
20054        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20055        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20056        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20057        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
20058        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20059        assert_eq_m128h(r, e);
20060        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
20061        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20062        assert_eq_m128h(r, e);
20063    }
20064
20065    #[simd_test(enable = "avx512fp16")]
20066    unsafe fn test_mm_maskz_fcmadd_sch() {
20067        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20068        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20069        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20070        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
20071        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20072        assert_eq_m128h(r, e);
20073        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
20074        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20075        assert_eq_m128h(r, e);
20076    }
20077
20078    #[simd_test(enable = "avx512fp16")]
20079    unsafe fn test_mm_fcmadd_round_sch() {
20080        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20081        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20082        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20083        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20084        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20085        assert_eq_m128h(r, e);
20086    }
20087
20088    #[simd_test(enable = "avx512fp16")]
20089    unsafe fn test_mm_mask_fcmadd_round_sch() {
20090        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20091        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20092        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20093        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20094            a, 0, b, c,
20095        );
20096        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20097        assert_eq_m128h(r, e);
20098        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20099            a, 1, b, c,
20100        );
20101        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20102        assert_eq_m128h(r, e);
20103    }
20104
20105    #[simd_test(enable = "avx512fp16")]
20106    unsafe fn test_mm_mask3_fcmadd_round_sch() {
20107        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20108        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20109        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20110        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20111            a, b, c, 0,
20112        );
20113        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20114        assert_eq_m128h(r, e);
20115        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20116            a, b, c, 1,
20117        );
20118        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20119        assert_eq_m128h(r, e);
20120    }
20121
20122    #[simd_test(enable = "avx512fp16")]
20123    unsafe fn test_mm_maskz_fcmadd_round_sch() {
20124        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20125        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20126        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20127        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20128            0, a, b, c,
20129        );
20130        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20131        assert_eq_m128h(r, e);
20132        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20133            1, a, b, c,
20134        );
20135        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20136        assert_eq_m128h(r, e);
20137    }
20138
20139    #[simd_test(enable = "avx512fp16,avx512vl")]
20140    unsafe fn test_mm_fmadd_ph() {
20141        let a = _mm_set1_ph(1.0);
20142        let b = _mm_set1_ph(2.0);
20143        let c = _mm_set1_ph(3.0);
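        // Plain (real) f16 FMA in every lane: 1.0 * 2.0 + 3.0 = 5.0.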
20144        let r = _mm_fmadd_ph(a, b, c);
20145        let e = _mm_set1_ph(5.0);
20146        assert_eq_m128h(r, e);
20147    }
20148
20149    #[simd_test(enable = "avx512fp16,avx512vl")]
20150    unsafe fn test_mm_mask_fmadd_ph() {
20151        let a = _mm_set1_ph(1.0);
20152        let b = _mm_set1_ph(2.0);
20153        let c = _mm_set1_ph(3.0);
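        // For the real _ph forms each mask bit covers a single f16 lane; lanes whose bit is clear keep a.
        // Note that _mm_set_ph lists elements from highest index to lowest.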
20154        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
20155        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
20156        assert_eq_m128h(r, e);
20157    }
20158
20159    #[simd_test(enable = "avx512fp16,avx512vl")]
20160    unsafe fn test_mm_mask3_fmadd_ph() {
20161        let a = _mm_set1_ph(1.0);
20162        let b = _mm_set1_ph(2.0);
20163        let c = _mm_set1_ph(3.0);
20164        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
20165        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
20166        assert_eq_m128h(r, e);
20167    }
20168
20169    #[simd_test(enable = "avx512fp16,avx512vl")]
20170    unsafe fn test_mm_maskz_fmadd_ph() {
20171        let a = _mm_set1_ph(1.0);
20172        let b = _mm_set1_ph(2.0);
20173        let c = _mm_set1_ph(3.0);
20174        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
20175        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
20176        assert_eq_m128h(r, e);
20177    }
20178
20179    #[simd_test(enable = "avx512fp16,avx512vl")]
20180    unsafe fn test_mm256_fmadd_ph() {
20181        let a = _mm256_set1_ph(1.0);
20182        let b = _mm256_set1_ph(2.0);
20183        let c = _mm256_set1_ph(3.0);
20184        let r = _mm256_fmadd_ph(a, b, c);
20185        let e = _mm256_set1_ph(5.0);
20186        assert_eq_m256h(r, e);
20187    }
20188
20189    #[simd_test(enable = "avx512fp16,avx512vl")]
20190    unsafe fn test_mm256_mask_fmadd_ph() {
20191        let a = _mm256_set1_ph(1.0);
20192        let b = _mm256_set1_ph(2.0);
20193        let c = _mm256_set1_ph(3.0);
20194        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
20195        let e = _mm256_set_ph(
20196            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20197        );
20198        assert_eq_m256h(r, e);
20199    }
20200
20201    #[simd_test(enable = "avx512fp16,avx512vl")]
20202    unsafe fn test_mm256_mask3_fmadd_ph() {
20203        let a = _mm256_set1_ph(1.0);
20204        let b = _mm256_set1_ph(2.0);
20205        let c = _mm256_set1_ph(3.0);
20206        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
20207        let e = _mm256_set_ph(
20208            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20209        );
20210        assert_eq_m256h(r, e);
20211    }
20212
20213    #[simd_test(enable = "avx512fp16,avx512vl")]
20214    unsafe fn test_mm256_maskz_fmadd_ph() {
20215        let a = _mm256_set1_ph(1.0);
20216        let b = _mm256_set1_ph(2.0);
20217        let c = _mm256_set1_ph(3.0);
20218        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
20219        let e = _mm256_set_ph(
20220            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20221        );
20222        assert_eq_m256h(r, e);
20223    }
20224
20225    #[simd_test(enable = "avx512fp16")]
20226    unsafe fn test_mm512_fmadd_ph() {
20227        let a = _mm512_set1_ph(1.0);
20228        let b = _mm512_set1_ph(2.0);
20229        let c = _mm512_set1_ph(3.0);
20230        let r = _mm512_fmadd_ph(a, b, c);
20231        let e = _mm512_set1_ph(5.0);
20232        assert_eq_m512h(r, e);
20233    }
20234
20235    #[simd_test(enable = "avx512fp16")]
20236    unsafe fn test_mm512_mask_fmadd_ph() {
20237        let a = _mm512_set1_ph(1.0);
20238        let b = _mm512_set1_ph(2.0);
20239        let c = _mm512_set1_ph(3.0);
20240        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20241        let e = _mm512_set_ph(
20242            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20243            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20244        );
20245        assert_eq_m512h(r, e);
20246    }
20247
20248    #[simd_test(enable = "avx512fp16")]
20249    unsafe fn test_mm512_mask3_fmadd_ph() {
20250        let a = _mm512_set1_ph(1.0);
20251        let b = _mm512_set1_ph(2.0);
20252        let c = _mm512_set1_ph(3.0);
20253        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20254        let e = _mm512_set_ph(
20255            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20256            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20257        );
20258        assert_eq_m512h(r, e);
20259    }
20260
20261    #[simd_test(enable = "avx512fp16")]
20262    unsafe fn test_mm512_maskz_fmadd_ph() {
20263        let a = _mm512_set1_ph(1.0);
20264        let b = _mm512_set1_ph(2.0);
20265        let c = _mm512_set1_ph(3.0);
20266        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
20267        let e = _mm512_set_ph(
20268            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20269            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20270        );
20271        assert_eq_m512h(r, e);
20272    }
20273
20274    #[simd_test(enable = "avx512fp16")]
20275    unsafe fn test_mm512_fmadd_round_ph() {
20276        let a = _mm512_set1_ph(1.0);
20277        let b = _mm512_set1_ph(2.0);
20278        let c = _mm512_set1_ph(3.0);
20279        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20280        let e = _mm512_set1_ph(5.0);
20281        assert_eq_m512h(r, e);
20282    }
20283
20284    #[simd_test(enable = "avx512fp16")]
20285    unsafe fn test_mm512_mask_fmadd_round_ph() {
20286        let a = _mm512_set1_ph(1.0);
20287        let b = _mm512_set1_ph(2.0);
20288        let c = _mm512_set1_ph(3.0);
20289        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20290            a,
20291            0b01010101010101010101010101010101,
20292            b,
20293            c,
20294        );
20295        let e = _mm512_set_ph(
20296            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20297            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20298        );
20299        assert_eq_m512h(r, e);
20300    }
20301
20302    #[simd_test(enable = "avx512fp16")]
20303    unsafe fn test_mm512_mask3_fmadd_round_ph() {
20304        let a = _mm512_set1_ph(1.0);
20305        let b = _mm512_set1_ph(2.0);
20306        let c = _mm512_set1_ph(3.0);
20307        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20308            a,
20309            b,
20310            c,
20311            0b01010101010101010101010101010101,
20312        );
20313        let e = _mm512_set_ph(
20314            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20315            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20316        );
20317        assert_eq_m512h(r, e);
20318    }
20319
20320    #[simd_test(enable = "avx512fp16")]
20321    unsafe fn test_mm512_maskz_fmadd_round_ph() {
20322        let a = _mm512_set1_ph(1.0);
20323        let b = _mm512_set1_ph(2.0);
20324        let c = _mm512_set1_ph(3.0);
20325        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20326            0b01010101010101010101010101010101,
20327            a,
20328            b,
20329            c,
20330        );
20331        let e = _mm512_set_ph(
20332            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20333            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20334        );
20335        assert_eq_m512h(r, e);
20336    }
20337
20338    #[simd_test(enable = "avx512fp16")]
20339    unsafe fn test_mm_fmadd_sh() {
20340        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20341        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20342        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
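        // Scalar real FMA on element 0 only: 1.0 * 2.0 + 3.0 = 5.0; elements 1..7 are copied from a.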
20343        let r = _mm_fmadd_sh(a, b, c);
20344        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20345        assert_eq_m128h(r, e);
20346    }
20347
20348    #[simd_test(enable = "avx512fp16")]
20349    unsafe fn test_mm_mask_fmadd_sh() {
20350        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20351        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20352        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20353        let r = _mm_mask_fmadd_sh(a, 0, b, c);
20354        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20355        assert_eq_m128h(r, e);
20356        let r = _mm_mask_fmadd_sh(a, 1, b, c);
20357        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20358        assert_eq_m128h(r, e);
20359    }
20360
20361    #[simd_test(enable = "avx512fp16")]
20362    unsafe fn test_mm_mask3_fmadd_sh() {
20363        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20364        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20365        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20366        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
20367        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20368        assert_eq_m128h(r, e);
20369        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
20370        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20371        assert_eq_m128h(r, e);
20372    }
20373
20374    #[simd_test(enable = "avx512fp16")]
20375    unsafe fn test_mm_maskz_fmadd_sh() {
20376        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20377        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20378        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20379        let r = _mm_maskz_fmadd_sh(0, a, b, c);
20380        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20381        assert_eq_m128h(r, e);
20382        let r = _mm_maskz_fmadd_sh(1, a, b, c);
20383        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20384        assert_eq_m128h(r, e);
20385    }
20386
20387    #[simd_test(enable = "avx512fp16")]
20388    unsafe fn test_mm_fmadd_round_sh() {
20389        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20390        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20391        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20392        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20393        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20394        assert_eq_m128h(r, e);
20395    }
20396
20397    #[simd_test(enable = "avx512fp16")]
20398    unsafe fn test_mm_mask_fmadd_round_sh() {
20399        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20400        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20401        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20402        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20403            a, 0, b, c,
20404        );
20405        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20406        assert_eq_m128h(r, e);
20407        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20408            a, 1, b, c,
20409        );
20410        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20411        assert_eq_m128h(r, e);
20412    }
20413
20414    #[simd_test(enable = "avx512fp16")]
20415    unsafe fn test_mm_mask3_fmadd_round_sh() {
20416        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20417        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20418        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20419        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20420            a, b, c, 0,
20421        );
20422        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20423        assert_eq_m128h(r, e);
20424        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20425            a, b, c, 1,
20426        );
20427        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20428        assert_eq_m128h(r, e);
20429    }
20430
20431    #[simd_test(enable = "avx512fp16")]
20432    unsafe fn test_mm_maskz_fmadd_round_sh() {
20433        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20434        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20435        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20436        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20437            0, a, b, c,
20438        );
20439        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20440        assert_eq_m128h(r, e);
20441        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20442            1, a, b, c,
20443        );
20444        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20445        assert_eq_m128h(r, e);
20446    }
20447
20448    #[simd_test(enable = "avx512fp16,avx512vl")]
20449    unsafe fn test_mm_fmsub_ph() {
20450        let a = _mm_set1_ph(1.0);
20451        let b = _mm_set1_ph(2.0);
20452        let c = _mm_set1_ph(3.0);
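        // Fused multiply-subtract: 1.0 * 2.0 - 3.0 = -1.0.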
20453        let r = _mm_fmsub_ph(a, b, c);
20454        let e = _mm_set1_ph(-1.0);
20455        assert_eq_m128h(r, e);
20456    }
20457
20458    #[simd_test(enable = "avx512fp16,avx512vl")]
20459    unsafe fn test_mm_mask_fmsub_ph() {
20460        let a = _mm_set1_ph(1.0);
20461        let b = _mm_set1_ph(2.0);
20462        let c = _mm_set1_ph(3.0);
20463        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
20464        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
20465        assert_eq_m128h(r, e);
20466    }
20467
20468    #[simd_test(enable = "avx512fp16,avx512vl")]
20469    unsafe fn test_mm_mask3_fmsub_ph() {
20470        let a = _mm_set1_ph(1.0);
20471        let b = _mm_set1_ph(2.0);
20472        let c = _mm_set1_ph(3.0);
20473        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
20474        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
20475        assert_eq_m128h(r, e);
20476    }
20477
20478    #[simd_test(enable = "avx512fp16,avx512vl")]
20479    unsafe fn test_mm_maskz_fmsub_ph() {
20480        let a = _mm_set1_ph(1.0);
20481        let b = _mm_set1_ph(2.0);
20482        let c = _mm_set1_ph(3.0);
20483        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
20484        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
20485        assert_eq_m128h(r, e);
20486    }
20487
20488    #[simd_test(enable = "avx512fp16,avx512vl")]
20489    unsafe fn test_mm256_fmsub_ph() {
20490        let a = _mm256_set1_ph(1.0);
20491        let b = _mm256_set1_ph(2.0);
20492        let c = _mm256_set1_ph(3.0);
20493        let r = _mm256_fmsub_ph(a, b, c);
20494        let e = _mm256_set1_ph(-1.0);
20495        assert_eq_m256h(r, e);
20496    }
20497
20498    #[simd_test(enable = "avx512fp16,avx512vl")]
20499    unsafe fn test_mm256_mask_fmsub_ph() {
20500        let a = _mm256_set1_ph(1.0);
20501        let b = _mm256_set1_ph(2.0);
20502        let c = _mm256_set1_ph(3.0);
20503        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
20504        let e = _mm256_set_ph(
20505            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20506        );
20507        assert_eq_m256h(r, e);
20508    }
20509
20510    #[simd_test(enable = "avx512fp16,avx512vl")]
20511    unsafe fn test_mm256_mask3_fmsub_ph() {
20512        let a = _mm256_set1_ph(1.0);
20513        let b = _mm256_set1_ph(2.0);
20514        let c = _mm256_set1_ph(3.0);
20515        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
20516        let e = _mm256_set_ph(
20517            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20518        );
20519        assert_eq_m256h(r, e);
20520    }
20521
20522    #[simd_test(enable = "avx512fp16,avx512vl")]
20523    unsafe fn test_mm256_maskz_fmsub_ph() {
20524        let a = _mm256_set1_ph(1.0);
20525        let b = _mm256_set1_ph(2.0);
20526        let c = _mm256_set1_ph(3.0);
20527        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
20528        let e = _mm256_set_ph(
20529            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20530        );
20531        assert_eq_m256h(r, e);
20532    }
20533
20534    #[simd_test(enable = "avx512fp16")]
20535    unsafe fn test_mm512_fmsub_ph() {
20536        let a = _mm512_set1_ph(1.0);
20537        let b = _mm512_set1_ph(2.0);
20538        let c = _mm512_set1_ph(3.0);
20539        let r = _mm512_fmsub_ph(a, b, c);
20540        let e = _mm512_set1_ph(-1.0);
20541        assert_eq_m512h(r, e);
20542    }
20543
20544    #[simd_test(enable = "avx512fp16")]
20545    unsafe fn test_mm512_mask_fmsub_ph() {
20546        let a = _mm512_set1_ph(1.0);
20547        let b = _mm512_set1_ph(2.0);
20548        let c = _mm512_set1_ph(3.0);
20549        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
20550        let e = _mm512_set_ph(
20551            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20552            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20553        );
20554        assert_eq_m512h(r, e);
20555    }
20556
20557    #[simd_test(enable = "avx512fp16")]
20558    unsafe fn test_mm512_mask3_fmsub_ph() {
20559        let a = _mm512_set1_ph(1.0);
20560        let b = _mm512_set1_ph(2.0);
20561        let c = _mm512_set1_ph(3.0);
20562        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
20563        let e = _mm512_set_ph(
20564            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20565            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20566        );
20567        assert_eq_m512h(r, e);
20568    }
20569
20570    #[simd_test(enable = "avx512fp16")]
20571    unsafe fn test_mm512_maskz_fmsub_ph() {
20572        let a = _mm512_set1_ph(1.0);
20573        let b = _mm512_set1_ph(2.0);
20574        let c = _mm512_set1_ph(3.0);
20575        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
20576        let e = _mm512_set_ph(
20577            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20578            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20579        );
20580        assert_eq_m512h(r, e);
20581    }
20582
20583    #[simd_test(enable = "avx512fp16")]
20584    unsafe fn test_mm512_fmsub_round_ph() {
20585        let a = _mm512_set1_ph(1.0);
20586        let b = _mm512_set1_ph(2.0);
20587        let c = _mm512_set1_ph(3.0);
20588        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20589        let e = _mm512_set1_ph(-1.0);
20590        assert_eq_m512h(r, e);
20591    }
20592
20593    #[simd_test(enable = "avx512fp16")]
20594    unsafe fn test_mm512_mask_fmsub_round_ph() {
20595        let a = _mm512_set1_ph(1.0);
20596        let b = _mm512_set1_ph(2.0);
20597        let c = _mm512_set1_ph(3.0);
20598        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20599            a,
20600            0b01010101010101010101010101010101,
20601            b,
20602            c,
20603        );
20604        let e = _mm512_set_ph(
20605            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20606            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20607        );
20608        assert_eq_m512h(r, e);
20609    }
20610
20611    #[simd_test(enable = "avx512fp16")]
20612    unsafe fn test_mm512_mask3_fmsub_round_ph() {
20613        let a = _mm512_set1_ph(1.0);
20614        let b = _mm512_set1_ph(2.0);
20615        let c = _mm512_set1_ph(3.0);
20616        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20617            a,
20618            b,
20619            c,
20620            0b01010101010101010101010101010101,
20621        );
20622        let e = _mm512_set_ph(
20623            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20624            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20625        );
20626        assert_eq_m512h(r, e);
20627    }
20628
20629    #[simd_test(enable = "avx512fp16")]
20630    unsafe fn test_mm512_maskz_fmsub_round_ph() {
20631        let a = _mm512_set1_ph(1.0);
20632        let b = _mm512_set1_ph(2.0);
20633        let c = _mm512_set1_ph(3.0);
20634        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20635            0b01010101010101010101010101010101,
20636            a,
20637            b,
20638            c,
20639        );
20640        let e = _mm512_set_ph(
20641            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20642            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20643        );
20644        assert_eq_m512h(r, e);
20645    }
20646
20647    #[simd_test(enable = "avx512fp16")]
20648    unsafe fn test_mm_fmsub_sh() {
20649        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20650        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20651        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20652        let r = _mm_fmsub_sh(a, b, c);
20653        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20654        assert_eq_m128h(r, e);
20655    }
20656
20657    #[simd_test(enable = "avx512fp16")]
20658    unsafe fn test_mm_mask_fmsub_sh() {
20659        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20660        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20661        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20662        let r = _mm_mask_fmsub_sh(a, 0, b, c);
20663        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20664        assert_eq_m128h(r, e);
20665        let r = _mm_mask_fmsub_sh(a, 1, b, c);
20666        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20667        assert_eq_m128h(r, e);
20668    }
20669
20670    #[simd_test(enable = "avx512fp16")]
20671    unsafe fn test_mm_mask3_fmsub_sh() {
20672        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20673        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20674        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20675        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
20676        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20677        assert_eq_m128h(r, e);
20678        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
20679        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20680        assert_eq_m128h(r, e);
20681    }
20682
20683    #[simd_test(enable = "avx512fp16")]
20684    unsafe fn test_mm_maskz_fmsub_sh() {
20685        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20686        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20687        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20688        let r = _mm_maskz_fmsub_sh(0, a, b, c);
20689        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20690        assert_eq_m128h(r, e);
20691        let r = _mm_maskz_fmsub_sh(1, a, b, c);
20692        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20693        assert_eq_m128h(r, e);
20694    }
20695
20696    #[simd_test(enable = "avx512fp16")]
20697    unsafe fn test_mm_fmsub_round_sh() {
20698        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20699        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20700        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20701        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20702        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20703        assert_eq_m128h(r, e);
20704    }
20705
20706    #[simd_test(enable = "avx512fp16")]
20707    unsafe fn test_mm_mask_fmsub_round_sh() {
20708        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20709        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20710        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20711        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20712            a, 0, b, c,
20713        );
20714        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20715        assert_eq_m128h(r, e);
20716        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20717            a, 1, b, c,
20718        );
20719        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20720        assert_eq_m128h(r, e);
20721    }
20722
20723    #[simd_test(enable = "avx512fp16")]
20724    unsafe fn test_mm_mask3_fmsub_round_sh() {
20725        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20726        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20727        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20728        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20729            a, b, c, 0,
20730        );
20731        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20732        assert_eq_m128h(r, e);
20733        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20734            a, b, c, 1,
20735        );
20736        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20737        assert_eq_m128h(r, e);
20738    }
20739
20740    #[simd_test(enable = "avx512fp16")]
20741    unsafe fn test_mm_maskz_fmsub_round_sh() {
20742        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20743        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20744        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20745        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20746            0, a, b, c,
20747        );
20748        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20749        assert_eq_m128h(r, e);
20750        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20751            1, a, b, c,
20752        );
20753        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20754        assert_eq_m128h(r, e);
20755    }
20756
20757    #[simd_test(enable = "avx512fp16,avx512vl")]
20758    unsafe fn test_mm_fnmadd_ph() {
20759        let a = _mm_set1_ph(1.0);
20760        let b = _mm_set1_ph(2.0);
20761        let c = _mm_set1_ph(3.0);
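        // Fused negated multiply-add: -(1.0 * 2.0) + 3.0 = 1.0.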
20762        let r = _mm_fnmadd_ph(a, b, c);
20763        let e = _mm_set1_ph(1.0);
20764        assert_eq_m128h(r, e);
20765    }
20766
20767    #[simd_test(enable = "avx512fp16,avx512vl")]
20768    unsafe fn test_mm_mask_fnmadd_ph() {
20769        let a = _mm_set1_ph(1.0);
20770        let b = _mm_set1_ph(2.0);
20771        let c = _mm_set1_ph(3.0);
20772        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
20773        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
20774        assert_eq_m128h(r, e);
20775    }
20776
20777    #[simd_test(enable = "avx512fp16,avx512vl")]
20778    unsafe fn test_mm_mask3_fnmadd_ph() {
20779        let a = _mm_set1_ph(1.0);
20780        let b = _mm_set1_ph(2.0);
20781        let c = _mm_set1_ph(3.0);
20782        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
20783        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
20784        assert_eq_m128h(r, e);
20785    }
20786
20787    #[simd_test(enable = "avx512fp16,avx512vl")]
20788    unsafe fn test_mm_maskz_fnmadd_ph() {
20789        let a = _mm_set1_ph(1.0);
20790        let b = _mm_set1_ph(2.0);
20791        let c = _mm_set1_ph(3.0);
20792        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
20793        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
20794        assert_eq_m128h(r, e);
20795    }
20796
20797    #[simd_test(enable = "avx512fp16,avx512vl")]
20798    unsafe fn test_mm256_fnmadd_ph() {
20799        let a = _mm256_set1_ph(1.0);
20800        let b = _mm256_set1_ph(2.0);
20801        let c = _mm256_set1_ph(3.0);
20802        let r = _mm256_fnmadd_ph(a, b, c);
20803        let e = _mm256_set1_ph(1.0);
20804        assert_eq_m256h(r, e);
20805    }
20806
20807    #[simd_test(enable = "avx512fp16,avx512vl")]
20808    unsafe fn test_mm256_mask_fnmadd_ph() {
20809        let a = _mm256_set1_ph(1.0);
20810        let b = _mm256_set1_ph(2.0);
20811        let c = _mm256_set1_ph(3.0);
20812        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
20813        let e = _mm256_set_ph(
20814            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20815        );
20816        assert_eq_m256h(r, e);
20817    }
20818
20819    #[simd_test(enable = "avx512fp16,avx512vl")]
20820    unsafe fn test_mm256_mask3_fnmadd_ph() {
20821        let a = _mm256_set1_ph(1.0);
20822        let b = _mm256_set1_ph(2.0);
20823        let c = _mm256_set1_ph(3.0);
20824        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
20825        let e = _mm256_set_ph(
20826            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20827        );
20828        assert_eq_m256h(r, e);
20829    }
20830
20831    #[simd_test(enable = "avx512fp16,avx512vl")]
20832    unsafe fn test_mm256_maskz_fnmadd_ph() {
20833        let a = _mm256_set1_ph(1.0);
20834        let b = _mm256_set1_ph(2.0);
20835        let c = _mm256_set1_ph(3.0);
20836        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
20837        let e = _mm256_set_ph(
20838            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20839        );
20840        assert_eq_m256h(r, e);
20841    }
20842
20843    #[simd_test(enable = "avx512fp16")]
20844    unsafe fn test_mm512_fnmadd_ph() {
20845        let a = _mm512_set1_ph(1.0);
20846        let b = _mm512_set1_ph(2.0);
20847        let c = _mm512_set1_ph(3.0);
20848        let r = _mm512_fnmadd_ph(a, b, c);
20849        let e = _mm512_set1_ph(1.0);
20850        assert_eq_m512h(r, e);
20851    }
20852
20853    #[simd_test(enable = "avx512fp16")]
20854    unsafe fn test_mm512_mask_fnmadd_ph() {
20855        let a = _mm512_set1_ph(1.0);
20856        let b = _mm512_set1_ph(2.0);
20857        let c = _mm512_set1_ph(3.0);
20858        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20859        let e = _mm512_set_ph(
20860            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20861            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20862        );
20863        assert_eq_m512h(r, e);
20864    }
20865
20866    #[simd_test(enable = "avx512fp16")]
20867    unsafe fn test_mm512_mask3_fnmadd_ph() {
20868        let a = _mm512_set1_ph(1.0);
20869        let b = _mm512_set1_ph(2.0);
20870        let c = _mm512_set1_ph(3.0);
20871        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20872        let e = _mm512_set_ph(
20873            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20874            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20875        );
20876        assert_eq_m512h(r, e);
20877    }
20878
20879    #[simd_test(enable = "avx512fp16")]
20880    unsafe fn test_mm512_maskz_fnmadd_ph() {
20881        let a = _mm512_set1_ph(1.0);
20882        let b = _mm512_set1_ph(2.0);
20883        let c = _mm512_set1_ph(3.0);
20884        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
20885        let e = _mm512_set_ph(
20886            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20887            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20888        );
20889        assert_eq_m512h(r, e);
20890    }
20891
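    // The `_round` variants take the rounding mode as a const generic. The combination
    // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` is round-to-nearest-even with
    // exceptions suppressed, i.e. the default behaviour, so the expected values match
    // the non-`_round` tests above.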
20892    #[simd_test(enable = "avx512fp16")]
20893    unsafe fn test_mm512_fnmadd_round_ph() {
20894        let a = _mm512_set1_ph(1.0);
20895        let b = _mm512_set1_ph(2.0);
20896        let c = _mm512_set1_ph(3.0);
20897        let r =
20898            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20899        let e = _mm512_set1_ph(1.0);
20900        assert_eq_m512h(r, e);
20901    }
20902
20903    #[simd_test(enable = "avx512fp16")]
20904    unsafe fn test_mm512_mask_fnmadd_round_ph() {
20905        let a = _mm512_set1_ph(1.0);
20906        let b = _mm512_set1_ph(2.0);
20907        let c = _mm512_set1_ph(3.0);
20908        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20909            a,
20910            0b01010101010101010101010101010101,
20911            b,
20912            c,
20913        );
20914        let e = _mm512_set_ph(
20915            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20916            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20917        );
20918        assert_eq_m512h(r, e);
20919    }
20920
20921    #[simd_test(enable = "avx512fp16")]
20922    unsafe fn test_mm512_mask3_fnmadd_round_ph() {
20923        let a = _mm512_set1_ph(1.0);
20924        let b = _mm512_set1_ph(2.0);
20925        let c = _mm512_set1_ph(3.0);
20926        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20927            a,
20928            b,
20929            c,
20930            0b01010101010101010101010101010101,
20931        );
20932        let e = _mm512_set_ph(
20933            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20934            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20935        );
20936        assert_eq_m512h(r, e);
20937    }
20938
20939    #[simd_test(enable = "avx512fp16")]
20940    unsafe fn test_mm512_maskz_fnmadd_round_ph() {
20941        let a = _mm512_set1_ph(1.0);
20942        let b = _mm512_set1_ph(2.0);
20943        let c = _mm512_set1_ph(3.0);
20944        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20945            0b01010101010101010101010101010101,
20946            a,
20947            b,
20948            c,
20949        );
20950        let e = _mm512_set_ph(
20951            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20952            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20953        );
20954        assert_eq_m512h(r, e);
20955    }
20956
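    // The scalar `_sh` variants operate on lane 0 only; lanes 1..7 of the result are
    // copied from `a` (or from `c` for the `mask3_` forms), which is why the upper lanes
    // of `e` mirror the corresponding source vector.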
20957    #[simd_test(enable = "avx512fp16")]
20958    unsafe fn test_mm_fnmadd_sh() {
20959        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20960        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20961        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20962        let r = _mm_fnmadd_sh(a, b, c);
20963        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20964        assert_eq_m128h(r, e);
20965    }
20966
20967    #[simd_test(enable = "avx512fp16")]
20968    unsafe fn test_mm_mask_fnmadd_sh() {
20969        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20970        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20971        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20972        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
20973        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20974        assert_eq_m128h(r, e);
20975        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
20976        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20977        assert_eq_m128h(r, e);
20978    }
20979
20980    #[simd_test(enable = "avx512fp16")]
20981    unsafe fn test_mm_mask3_fnmadd_sh() {
20982        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20983        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20984        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20985        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
20986        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20987        assert_eq_m128h(r, e);
20988        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
20989        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
20990        assert_eq_m128h(r, e);
20991    }
20992
20993    #[simd_test(enable = "avx512fp16")]
20994    unsafe fn test_mm_maskz_fnmadd_sh() {
20995        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20996        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20997        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20998        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
20999        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21000        assert_eq_m128h(r, e);
21001        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
21002        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21003        assert_eq_m128h(r, e);
21004    }
21005
21006    #[simd_test(enable = "avx512fp16")]
21007    unsafe fn test_mm_fnmadd_round_sh() {
21008        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21009        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21010        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21011        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21012        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21013        assert_eq_m128h(r, e);
21014    }
21015
21016    #[simd_test(enable = "avx512fp16")]
21017    unsafe fn test_mm_mask_fnmadd_round_sh() {
21018        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21019        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21020        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21021        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21022            a, 0, b, c,
21023        );
21024        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21025        assert_eq_m128h(r, e);
21026        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21027            a, 1, b, c,
21028        );
21029        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21030        assert_eq_m128h(r, e);
21031    }
21032
21033    #[simd_test(enable = "avx512fp16")]
21034    unsafe fn test_mm_mask3_fnmadd_round_sh() {
21035        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21036        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21037        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21038        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21039            a, b, c, 0,
21040        );
21041        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21042        assert_eq_m128h(r, e);
21043        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21044            a, b, c, 1,
21045        );
21046        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21047        assert_eq_m128h(r, e);
21048    }
21049
21050    #[simd_test(enable = "avx512fp16")]
21051    unsafe fn test_mm_maskz_fnmadd_round_sh() {
21052        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21053        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21054        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21055        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21056            0, a, b, c,
21057        );
21058        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21059        assert_eq_m128h(r, e);
21060        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21061            1, a, b, c,
21062        );
21063        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21064        assert_eq_m128h(r, e);
21065    }
21066
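    // FNMSUB computes `-(a * b) - c` per lane: -(1.0 * 2.0) - 3.0 = -5.0.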
21067    #[simd_test(enable = "avx512fp16,avx512vl")]
21068    unsafe fn test_mm_fnmsub_ph() {
21069        let a = _mm_set1_ph(1.0);
21070        let b = _mm_set1_ph(2.0);
21071        let c = _mm_set1_ph(3.0);
21072        let r = _mm_fnmsub_ph(a, b, c);
21073        let e = _mm_set1_ph(-5.0);
21074        assert_eq_m128h(r, e);
21075    }
21076
21077    #[simd_test(enable = "avx512fp16,avx512vl")]
21078    unsafe fn test_mm_mask_fnmsub_ph() {
21079        let a = _mm_set1_ph(1.0);
21080        let b = _mm_set1_ph(2.0);
21081        let c = _mm_set1_ph(3.0);
21082        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
21083        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
21084        assert_eq_m128h(r, e);
21085    }
21086
21087    #[simd_test(enable = "avx512fp16,avx512vl")]
21088    unsafe fn test_mm_mask3_fnmsub_ph() {
21089        let a = _mm_set1_ph(1.0);
21090        let b = _mm_set1_ph(2.0);
21091        let c = _mm_set1_ph(3.0);
21092        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
21093        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
21094        assert_eq_m128h(r, e);
21095    }
21096
21097    #[simd_test(enable = "avx512fp16,avx512vl")]
21098    unsafe fn test_mm_maskz_fnmsub_ph() {
21099        let a = _mm_set1_ph(1.0);
21100        let b = _mm_set1_ph(2.0);
21101        let c = _mm_set1_ph(3.0);
21102        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
21103        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
21104        assert_eq_m128h(r, e);
21105    }
21106
21107    #[simd_test(enable = "avx512fp16,avx512vl")]
21108    unsafe fn test_mm256_fnmsub_ph() {
21109        let a = _mm256_set1_ph(1.0);
21110        let b = _mm256_set1_ph(2.0);
21111        let c = _mm256_set1_ph(3.0);
21112        let r = _mm256_fnmsub_ph(a, b, c);
21113        let e = _mm256_set1_ph(-5.0);
21114        assert_eq_m256h(r, e);
21115    }
21116
21117    #[simd_test(enable = "avx512fp16,avx512vl")]
21118    unsafe fn test_mm256_mask_fnmsub_ph() {
21119        let a = _mm256_set1_ph(1.0);
21120        let b = _mm256_set1_ph(2.0);
21121        let c = _mm256_set1_ph(3.0);
21122        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
21123        let e = _mm256_set_ph(
21124            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21125        );
21126        assert_eq_m256h(r, e);
21127    }
21128
21129    #[simd_test(enable = "avx512fp16,avx512vl")]
21130    unsafe fn test_mm256_mask3_fnmsub_ph() {
21131        let a = _mm256_set1_ph(1.0);
21132        let b = _mm256_set1_ph(2.0);
21133        let c = _mm256_set1_ph(3.0);
21134        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
21135        let e = _mm256_set_ph(
21136            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21137        );
21138        assert_eq_m256h(r, e);
21139    }
21140
21141    #[simd_test(enable = "avx512fp16,avx512vl")]
21142    unsafe fn test_mm256_maskz_fnmsub_ph() {
21143        let a = _mm256_set1_ph(1.0);
21144        let b = _mm256_set1_ph(2.0);
21145        let c = _mm256_set1_ph(3.0);
21146        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
21147        let e = _mm256_set_ph(
21148            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21149        );
21150        assert_eq_m256h(r, e);
21151    }
21152
21153    #[simd_test(enable = "avx512fp16")]
21154    unsafe fn test_mm512_fnmsub_ph() {
21155        let a = _mm512_set1_ph(1.0);
21156        let b = _mm512_set1_ph(2.0);
21157        let c = _mm512_set1_ph(3.0);
21158        let r = _mm512_fnmsub_ph(a, b, c);
21159        let e = _mm512_set1_ph(-5.0);
21160        assert_eq_m512h(r, e);
21161    }
21162
21163    #[simd_test(enable = "avx512fp16")]
21164    unsafe fn test_mm512_mask_fnmsub_ph() {
21165        let a = _mm512_set1_ph(1.0);
21166        let b = _mm512_set1_ph(2.0);
21167        let c = _mm512_set1_ph(3.0);
21168        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
21169        let e = _mm512_set_ph(
21170            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21171            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21172        );
21173        assert_eq_m512h(r, e);
21174    }
21175
21176    #[simd_test(enable = "avx512fp16")]
21177    unsafe fn test_mm512_mask3_fnmsub_ph() {
21178        let a = _mm512_set1_ph(1.0);
21179        let b = _mm512_set1_ph(2.0);
21180        let c = _mm512_set1_ph(3.0);
21181        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
21182        let e = _mm512_set_ph(
21183            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21184            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21185        );
21186        assert_eq_m512h(r, e);
21187    }
21188
21189    #[simd_test(enable = "avx512fp16")]
21190    unsafe fn test_mm512_maskz_fnmsub_ph() {
21191        let a = _mm512_set1_ph(1.0);
21192        let b = _mm512_set1_ph(2.0);
21193        let c = _mm512_set1_ph(3.0);
21194        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
21195        let e = _mm512_set_ph(
21196            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21197            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21198        );
21199        assert_eq_m512h(r, e);
21200    }
21201
21202    #[simd_test(enable = "avx512fp16")]
21203    unsafe fn test_mm512_fnmsub_round_ph() {
21204        let a = _mm512_set1_ph(1.0);
21205        let b = _mm512_set1_ph(2.0);
21206        let c = _mm512_set1_ph(3.0);
21207        let r =
21208            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21209        let e = _mm512_set1_ph(-5.0);
21210        assert_eq_m512h(r, e);
21211    }
21212
21213    #[simd_test(enable = "avx512fp16")]
21214    unsafe fn test_mm512_mask_fnmsub_round_ph() {
21215        let a = _mm512_set1_ph(1.0);
21216        let b = _mm512_set1_ph(2.0);
21217        let c = _mm512_set1_ph(3.0);
21218        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21219            a,
21220            0b01010101010101010101010101010101,
21221            b,
21222            c,
21223        );
21224        let e = _mm512_set_ph(
21225            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21226            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21227        );
21228        assert_eq_m512h(r, e);
21229    }
21230
21231    #[simd_test(enable = "avx512fp16")]
21232    unsafe fn test_mm512_mask3_fnmsub_round_ph() {
21233        let a = _mm512_set1_ph(1.0);
21234        let b = _mm512_set1_ph(2.0);
21235        let c = _mm512_set1_ph(3.0);
21236        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21237            a,
21238            b,
21239            c,
21240            0b01010101010101010101010101010101,
21241        );
21242        let e = _mm512_set_ph(
21243            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21244            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21245        );
21246        assert_eq_m512h(r, e);
21247    }
21248
21249    #[simd_test(enable = "avx512fp16")]
21250    unsafe fn test_mm512_maskz_fnmsub_round_ph() {
21251        let a = _mm512_set1_ph(1.0);
21252        let b = _mm512_set1_ph(2.0);
21253        let c = _mm512_set1_ph(3.0);
21254        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21255            0b01010101010101010101010101010101,
21256            a,
21257            b,
21258            c,
21259        );
21260        let e = _mm512_set_ph(
21261            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21262            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21263        );
21264        assert_eq_m512h(r, e);
21265    }
21266
21267    #[simd_test(enable = "avx512fp16")]
21268    unsafe fn test_mm_fnmsub_sh() {
21269        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21270        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21271        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21272        let r = _mm_fnmsub_sh(a, b, c);
21273        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21274        assert_eq_m128h(r, e);
21275    }
21276
21277    #[simd_test(enable = "avx512fp16")]
21278    unsafe fn test_mm_mask_fnmsub_sh() {
21279        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21280        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21281        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21282        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
21283        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21284        assert_eq_m128h(r, e);
21285        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
21286        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21287        assert_eq_m128h(r, e);
21288    }
21289
21290    #[simd_test(enable = "avx512fp16")]
21291    unsafe fn test_mm_mask3_fnmsub_sh() {
21292        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21293        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21294        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21295        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
21296        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21297        assert_eq_m128h(r, e);
21298        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
21299        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21300        assert_eq_m128h(r, e);
21301    }
21302
21303    #[simd_test(enable = "avx512fp16")]
21304    unsafe fn test_mm_maskz_fnmsub_sh() {
21305        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21306        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21307        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21308        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
21309        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21310        assert_eq_m128h(r, e);
21311        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
21312        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21313        assert_eq_m128h(r, e);
21314    }
21315
21316    #[simd_test(enable = "avx512fp16")]
21317    unsafe fn test_mm_fnmsub_round_sh() {
21318        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21319        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21320        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21321        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21322        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21323        assert_eq_m128h(r, e);
21324    }
21325
21326    #[simd_test(enable = "avx512fp16")]
21327    unsafe fn test_mm_mask_fnmsub_round_sh() {
21328        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21329        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21330        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21331        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21332            a, 0, b, c,
21333        );
21334        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21335        assert_eq_m128h(r, e);
21336        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21337            a, 1, b, c,
21338        );
21339        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21340        assert_eq_m128h(r, e);
21341    }
21342
21343    #[simd_test(enable = "avx512fp16")]
21344    unsafe fn test_mm_mask3_fnmsub_round_sh() {
21345        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21346        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21347        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21348        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21349            a, b, c, 0,
21350        );
21351        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21352        assert_eq_m128h(r, e);
21353        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21354            a, b, c, 1,
21355        );
21356        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21357        assert_eq_m128h(r, e);
21358    }
21359
21360    #[simd_test(enable = "avx512fp16")]
21361    unsafe fn test_mm_maskz_fnmsub_round_sh() {
21362        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21363        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21364        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21365        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21366            0, a, b, c,
21367        );
21368        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21369        assert_eq_m128h(r, e);
21370        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21371            1, a, b, c,
21372        );
21373        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21374        assert_eq_m128h(r, e);
21375    }
21376
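    // FMADDSUB alternates per lane: even-indexed lanes compute `a * b - c` (= -1.0) and
    // odd-indexed lanes compute `a * b + c` (= 5.0), so the expected vectors interleave
    // the two values.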
21377    #[simd_test(enable = "avx512fp16,avx512vl")]
21378    unsafe fn test_mm_fmaddsub_ph() {
21379        let a = _mm_set1_ph(1.0);
21380        let b = _mm_set1_ph(2.0);
21381        let c = _mm_set1_ph(3.0);
21382        let r = _mm_fmaddsub_ph(a, b, c);
21383        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
21384        assert_eq_m128h(r, e);
21385    }
21386
21387    #[simd_test(enable = "avx512fp16,avx512vl")]
21388    unsafe fn test_mm_mask_fmaddsub_ph() {
21389        let a = _mm_set1_ph(1.0);
21390        let b = _mm_set1_ph(2.0);
21391        let c = _mm_set1_ph(3.0);
21392        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
21393        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
21394        assert_eq_m128h(r, e);
21395    }
21396
21397    #[simd_test(enable = "avx512fp16,avx512vl")]
21398    unsafe fn test_mm_mask3_fmaddsub_ph() {
21399        let a = _mm_set1_ph(1.0);
21400        let b = _mm_set1_ph(2.0);
21401        let c = _mm_set1_ph(3.0);
21402        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
21403        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
21404        assert_eq_m128h(r, e);
21405    }
21406
21407    #[simd_test(enable = "avx512fp16,avx512vl")]
21408    unsafe fn test_mm_maskz_fmaddsub_ph() {
21409        let a = _mm_set1_ph(1.0);
21410        let b = _mm_set1_ph(2.0);
21411        let c = _mm_set1_ph(3.0);
21412        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
21413        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
21414        assert_eq_m128h(r, e);
21415    }
21416
21417    #[simd_test(enable = "avx512fp16,avx512vl")]
21418    unsafe fn test_mm256_fmaddsub_ph() {
21419        let a = _mm256_set1_ph(1.0);
21420        let b = _mm256_set1_ph(2.0);
21421        let c = _mm256_set1_ph(3.0);
21422        let r = _mm256_fmaddsub_ph(a, b, c);
21423        let e = _mm256_set_ph(
21424            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21425        );
21426        assert_eq_m256h(r, e);
21427    }
21428
21429    #[simd_test(enable = "avx512fp16,avx512vl")]
21430    unsafe fn test_mm256_mask_fmaddsub_ph() {
21431        let a = _mm256_set1_ph(1.0);
21432        let b = _mm256_set1_ph(2.0);
21433        let c = _mm256_set1_ph(3.0);
21434        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
21435        let e = _mm256_set_ph(
21436            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21437        );
21438        assert_eq_m256h(r, e);
21439    }
21440
21441    #[simd_test(enable = "avx512fp16,avx512vl")]
21442    unsafe fn test_mm256_mask3_fmaddsub_ph() {
21443        let a = _mm256_set1_ph(1.0);
21444        let b = _mm256_set1_ph(2.0);
21445        let c = _mm256_set1_ph(3.0);
21446        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
21447        let e = _mm256_set_ph(
21448            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21449        );
21450        assert_eq_m256h(r, e);
21451    }
21452
21453    #[simd_test(enable = "avx512fp16,avx512vl")]
21454    unsafe fn test_mm256_maskz_fmaddsub_ph() {
21455        let a = _mm256_set1_ph(1.0);
21456        let b = _mm256_set1_ph(2.0);
21457        let c = _mm256_set1_ph(3.0);
21458        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
21459        let e = _mm256_set_ph(
21460            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21461        );
21462        assert_eq_m256h(r, e);
21463    }
21464
21465    #[simd_test(enable = "avx512fp16")]
21466    unsafe fn test_mm512_fmaddsub_ph() {
21467        let a = _mm512_set1_ph(1.0);
21468        let b = _mm512_set1_ph(2.0);
21469        let c = _mm512_set1_ph(3.0);
21470        let r = _mm512_fmaddsub_ph(a, b, c);
21471        let e = _mm512_set_ph(
21472            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21473            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21474        );
21475        assert_eq_m512h(r, e);
21476    }
21477
21478    #[simd_test(enable = "avx512fp16")]
21479    unsafe fn test_mm512_mask_fmaddsub_ph() {
21480        let a = _mm512_set1_ph(1.0);
21481        let b = _mm512_set1_ph(2.0);
21482        let c = _mm512_set1_ph(3.0);
21483        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
21484        let e = _mm512_set_ph(
21485            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21486            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21487        );
21488        assert_eq_m512h(r, e);
21489    }
21490
21491    #[simd_test(enable = "avx512fp16")]
21492    unsafe fn test_mm512_mask3_fmaddsub_ph() {
21493        let a = _mm512_set1_ph(1.0);
21494        let b = _mm512_set1_ph(2.0);
21495        let c = _mm512_set1_ph(3.0);
21496        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
21497        let e = _mm512_set_ph(
21498            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21499            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21500        );
21501        assert_eq_m512h(r, e);
21502    }
21503
21504    #[simd_test(enable = "avx512fp16")]
21505    unsafe fn test_mm512_maskz_fmaddsub_ph() {
21506        let a = _mm512_set1_ph(1.0);
21507        let b = _mm512_set1_ph(2.0);
21508        let c = _mm512_set1_ph(3.0);
21509        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
21510        let e = _mm512_set_ph(
21511            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21512            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21513        );
21514        assert_eq_m512h(r, e);
21515    }
21516
21517    #[simd_test(enable = "avx512fp16")]
21518    unsafe fn test_mm512_fmaddsub_round_ph() {
21519        let a = _mm512_set1_ph(1.0);
21520        let b = _mm512_set1_ph(2.0);
21521        let c = _mm512_set1_ph(3.0);
21522        let r =
21523            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21524        let e = _mm512_set_ph(
21525            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21526            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21527        );
21528        assert_eq_m512h(r, e);
21529    }
21530
21531    #[simd_test(enable = "avx512fp16")]
21532    unsafe fn test_mm512_mask_fmaddsub_round_ph() {
21533        let a = _mm512_set1_ph(1.0);
21534        let b = _mm512_set1_ph(2.0);
21535        let c = _mm512_set1_ph(3.0);
21536        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21537            a,
21538            0b00110011001100110011001100110011,
21539            b,
21540            c,
21541        );
21542        let e = _mm512_set_ph(
21543            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21544            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21545        );
21546        assert_eq_m512h(r, e);
21547    }
21548
21549    #[simd_test(enable = "avx512fp16")]
21550    unsafe fn test_mm512_mask3_fmaddsub_round_ph() {
21551        let a = _mm512_set1_ph(1.0);
21552        let b = _mm512_set1_ph(2.0);
21553        let c = _mm512_set1_ph(3.0);
21554        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21555            a,
21556            b,
21557            c,
21558            0b00110011001100110011001100110011,
21559        );
21560        let e = _mm512_set_ph(
21561            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21562            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21563        );
21564        assert_eq_m512h(r, e);
21565    }
21566
21567    #[simd_test(enable = "avx512fp16")]
21568    unsafe fn test_mm512_maskz_fmaddsub_round_ph() {
21569        let a = _mm512_set1_ph(1.0);
21570        let b = _mm512_set1_ph(2.0);
21571        let c = _mm512_set1_ph(3.0);
21572        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21573            0b00110011001100110011001100110011,
21574            a,
21575            b,
21576            c,
21577        );
21578        let e = _mm512_set_ph(
21579            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21580            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21581        );
21582        assert_eq_m512h(r, e);
21583    }
21584
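    // FMSUBADD is the mirror of FMADDSUB: even-indexed lanes compute `a * b + c` (= 5.0)
    // and odd-indexed lanes compute `a * b - c` (= -1.0).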
21585    #[simd_test(enable = "avx512fp16,avx512vl")]
21586    unsafe fn test_mm_fmsubadd_ph() {
21587        let a = _mm_set1_ph(1.0);
21588        let b = _mm_set1_ph(2.0);
21589        let c = _mm_set1_ph(3.0);
21590        let r = _mm_fmsubadd_ph(a, b, c);
21591        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
21592        assert_eq_m128h(r, e);
21593    }
21594
21595    #[simd_test(enable = "avx512fp16,avx512vl")]
21596    unsafe fn test_mm_mask_fmsubadd_ph() {
21597        let a = _mm_set1_ph(1.0);
21598        let b = _mm_set1_ph(2.0);
21599        let c = _mm_set1_ph(3.0);
21600        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
21601        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
21602        assert_eq_m128h(r, e);
21603    }
21604
21605    #[simd_test(enable = "avx512fp16,avx512vl")]
21606    unsafe fn test_mm_mask3_fmsubadd_ph() {
21607        let a = _mm_set1_ph(1.0);
21608        let b = _mm_set1_ph(2.0);
21609        let c = _mm_set1_ph(3.0);
21610        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
21611        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
21612        assert_eq_m128h(r, e);
21613    }
21614
21615    #[simd_test(enable = "avx512fp16,avx512vl")]
21616    unsafe fn test_mm_maskz_fmsubadd_ph() {
21617        let a = _mm_set1_ph(1.0);
21618        let b = _mm_set1_ph(2.0);
21619        let c = _mm_set1_ph(3.0);
21620        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
21621        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
21622        assert_eq_m128h(r, e);
21623    }
21624
21625    #[simd_test(enable = "avx512fp16,avx512vl")]
21626    unsafe fn test_mm256_fmsubadd_ph() {
21627        let a = _mm256_set1_ph(1.0);
21628        let b = _mm256_set1_ph(2.0);
21629        let c = _mm256_set1_ph(3.0);
21630        let r = _mm256_fmsubadd_ph(a, b, c);
21631        let e = _mm256_set_ph(
21632            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21633        );
21634        assert_eq_m256h(r, e);
21635    }
21636
21637    #[simd_test(enable = "avx512fp16,avx512vl")]
21638    unsafe fn test_mm256_mask_fmsubadd_ph() {
21639        let a = _mm256_set1_ph(1.0);
21640        let b = _mm256_set1_ph(2.0);
21641        let c = _mm256_set1_ph(3.0);
21642        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
21643        let e = _mm256_set_ph(
21644            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21645        );
21646        assert_eq_m256h(r, e);
21647    }
21648
21649    #[simd_test(enable = "avx512fp16,avx512vl")]
21650    unsafe fn test_mm256_mask3_fmsubadd_ph() {
21651        let a = _mm256_set1_ph(1.0);
21652        let b = _mm256_set1_ph(2.0);
21653        let c = _mm256_set1_ph(3.0);
21654        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
21655        let e = _mm256_set_ph(
21656            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21657        );
21658        assert_eq_m256h(r, e);
21659    }
21660
21661    #[simd_test(enable = "avx512fp16,avx512vl")]
21662    unsafe fn test_mm256_maskz_fmsubadd_ph() {
21663        let a = _mm256_set1_ph(1.0);
21664        let b = _mm256_set1_ph(2.0);
21665        let c = _mm256_set1_ph(3.0);
21666        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
21667        let e = _mm256_set_ph(
21668            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21669        );
21670        assert_eq_m256h(r, e);
21671    }
21672
21673    #[simd_test(enable = "avx512fp16")]
21674    unsafe fn test_mm512_fmsubadd_ph() {
21675        let a = _mm512_set1_ph(1.0);
21676        let b = _mm512_set1_ph(2.0);
21677        let c = _mm512_set1_ph(3.0);
21678        let r = _mm512_fmsubadd_ph(a, b, c);
21679        let e = _mm512_set_ph(
21680            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21681            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21682        );
21683        assert_eq_m512h(r, e);
21684    }
21685
21686    #[simd_test(enable = "avx512fp16")]
21687    unsafe fn test_mm512_mask_fmsubadd_ph() {
21688        let a = _mm512_set1_ph(1.0);
21689        let b = _mm512_set1_ph(2.0);
21690        let c = _mm512_set1_ph(3.0);
21691        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
21692        let e = _mm512_set_ph(
21693            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21694            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21695        );
21696        assert_eq_m512h(r, e);
21697    }
21698
21699    #[simd_test(enable = "avx512fp16")]
21700    unsafe fn test_mm512_mask3_fmsubadd_ph() {
21701        let a = _mm512_set1_ph(1.0);
21702        let b = _mm512_set1_ph(2.0);
21703        let c = _mm512_set1_ph(3.0);
21704        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
21705        let e = _mm512_set_ph(
21706            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21707            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21708        );
21709        assert_eq_m512h(r, e);
21710    }
21711
21712    #[simd_test(enable = "avx512fp16")]
21713    unsafe fn test_mm512_maskz_fmsubadd_ph() {
21714        let a = _mm512_set1_ph(1.0);
21715        let b = _mm512_set1_ph(2.0);
21716        let c = _mm512_set1_ph(3.0);
21717        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
21718        let e = _mm512_set_ph(
21719            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21720            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21721        );
21722        assert_eq_m512h(r, e);
21723    }
21724
21725    #[simd_test(enable = "avx512fp16")]
21726    unsafe fn test_mm512_fmsubadd_round_ph() {
21727        let a = _mm512_set1_ph(1.0);
21728        let b = _mm512_set1_ph(2.0);
21729        let c = _mm512_set1_ph(3.0);
21730        let r =
21731            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21732        let e = _mm512_set_ph(
21733            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21734            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21735        );
21736        assert_eq_m512h(r, e);
21737    }
21738
21739    #[simd_test(enable = "avx512fp16")]
21740    unsafe fn test_mm512_mask_fmsubadd_round_ph() {
21741        let a = _mm512_set1_ph(1.0);
21742        let b = _mm512_set1_ph(2.0);
21743        let c = _mm512_set1_ph(3.0);
21744        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21745            a,
21746            0b00110011001100110011001100110011,
21747            b,
21748            c,
21749        );
21750        let e = _mm512_set_ph(
21751            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21752            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21753        );
21754        assert_eq_m512h(r, e);
21755    }
21756
21757    #[simd_test(enable = "avx512fp16")]
21758    unsafe fn test_mm512_mask3_fmsubadd_round_ph() {
21759        let a = _mm512_set1_ph(1.0);
21760        let b = _mm512_set1_ph(2.0);
21761        let c = _mm512_set1_ph(3.0);
21762        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21763            a,
21764            b,
21765            c,
21766            0b00110011001100110011001100110011,
21767        );
21768        let e = _mm512_set_ph(
21769            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21770            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21771        );
21772        assert_eq_m512h(r, e);
21773    }
21774
21775    #[simd_test(enable = "avx512fp16")]
21776    unsafe fn test_mm512_maskz_fmsubadd_round_ph() {
21777        let a = _mm512_set1_ph(1.0);
21778        let b = _mm512_set1_ph(2.0);
21779        let c = _mm512_set1_ph(3.0);
21780        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21781            0b00110011001100110011001100110011,
21782            a,
21783            b,
21784            c,
21785        );
21786        let e = _mm512_set_ph(
21787            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21788            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21789        );
21790        assert_eq_m512h(r, e);
21791    }
21792
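    // RCP produces an approximate reciprocal; the tests use a power-of-two input
    // (1.0 / 2.0 = 0.5) and rely on the approximation being exact for that case.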
21793    #[simd_test(enable = "avx512fp16,avx512vl")]
21794    unsafe fn test_mm_rcp_ph() {
21795        let a = _mm_set1_ph(2.0);
21796        let r = _mm_rcp_ph(a);
21797        let e = _mm_set1_ph(0.5);
21798        assert_eq_m128h(r, e);
21799    }
21800
21801    #[simd_test(enable = "avx512fp16,avx512vl")]
21802    unsafe fn test_mm_mask_rcp_ph() {
21803        let a = _mm_set1_ph(2.0);
21804        let src = _mm_set1_ph(1.0);
21805        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
21806        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21807        assert_eq_m128h(r, e);
21808    }
21809
21810    #[simd_test(enable = "avx512fp16,avx512vl")]
21811    unsafe fn test_mm_maskz_rcp_ph() {
21812        let a = _mm_set1_ph(2.0);
21813        let r = _mm_maskz_rcp_ph(0b01010101, a);
21814        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21815        assert_eq_m128h(r, e);
21816    }
21817
21818    #[simd_test(enable = "avx512fp16,avx512vl")]
21819    unsafe fn test_mm256_rcp_ph() {
21820        let a = _mm256_set1_ph(2.0);
21821        let r = _mm256_rcp_ph(a);
21822        let e = _mm256_set1_ph(0.5);
21823        assert_eq_m256h(r, e);
21824    }
21825
21826    #[simd_test(enable = "avx512fp16,avx512vl")]
21827    unsafe fn test_mm256_mask_rcp_ph() {
21828        let a = _mm256_set1_ph(2.0);
21829        let src = _mm256_set1_ph(1.0);
21830        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
21831        let e = _mm256_set_ph(
21832            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21833        );
21834        assert_eq_m256h(r, e);
21835    }
21836
21837    #[simd_test(enable = "avx512fp16,avx512vl")]
21838    unsafe fn test_mm256_maskz_rcp_ph() {
21839        let a = _mm256_set1_ph(2.0);
21840        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
21841        let e = _mm256_set_ph(
21842            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21843        );
21844        assert_eq_m256h(r, e);
21845    }
21846
21847    #[simd_test(enable = "avx512fp16")]
21848    unsafe fn test_mm512_rcp_ph() {
21849        let a = _mm512_set1_ph(2.0);
21850        let r = _mm512_rcp_ph(a);
21851        let e = _mm512_set1_ph(0.5);
21852        assert_eq_m512h(r, e);
21853    }
21854
21855    #[simd_test(enable = "avx512fp16")]
21856    unsafe fn test_mm512_mask_rcp_ph() {
21857        let a = _mm512_set1_ph(2.0);
21858        let src = _mm512_set1_ph(1.0);
21859        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
21860        let e = _mm512_set_ph(
21861            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21862            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21863        );
21864        assert_eq_m512h(r, e);
21865    }
21866
21867    #[simd_test(enable = "avx512fp16")]
21868    unsafe fn test_mm512_maskz_rcp_ph() {
21869        let a = _mm512_set1_ph(2.0);
21870        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
21871        let e = _mm512_set_ph(
21872            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21873            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21874        );
21875        assert_eq_m512h(r, e);
21876    }
21877
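    // Scalar form: lane 0 of the result is the approximate reciprocal of `b[0]`,
    // lanes 1..7 are copied from `a`.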
21878    #[simd_test(enable = "avx512fp16")]
21879    unsafe fn test_mm_rcp_sh() {
21880        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21881        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21882        let r = _mm_rcp_sh(a, b);
21883        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21884        assert_eq_m128h(r, e);
21885    }
21886
21887    #[simd_test(enable = "avx512fp16")]
21888    unsafe fn test_mm_mask_rcp_sh() {
21889        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21890        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21891        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21892        let r = _mm_mask_rcp_sh(src, 0, a, b);
21893        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21894        assert_eq_m128h(r, e);
21895        let r = _mm_mask_rcp_sh(src, 1, a, b);
21896        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21897        assert_eq_m128h(r, e);
21898    }
21899
21900    #[simd_test(enable = "avx512fp16")]
21901    unsafe fn test_mm_maskz_rcp_sh() {
21902        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21903        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21904        let r = _mm_maskz_rcp_sh(0, a, b);
21905        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21906        assert_eq_m128h(r, e);
21907        let r = _mm_maskz_rcp_sh(1, a, b);
21908        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21909        assert_eq_m128h(r, e);
21910    }
21911
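    // RSQRT produces an approximate reciprocal square root; 1.0 / sqrt(4.0) = 0.5, again
    // chosen so the approximation is exact.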
21912    #[simd_test(enable = "avx512fp16,avx512vl")]
21913    unsafe fn test_mm_rsqrt_ph() {
21914        let a = _mm_set1_ph(4.0);
21915        let r = _mm_rsqrt_ph(a);
21916        let e = _mm_set1_ph(0.5);
21917        assert_eq_m128h(r, e);
21918    }
21919
21920    #[simd_test(enable = "avx512fp16,avx512vl")]
21921    unsafe fn test_mm_mask_rsqrt_ph() {
21922        let a = _mm_set1_ph(4.0);
21923        let src = _mm_set1_ph(1.0);
21924        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
21925        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21926        assert_eq_m128h(r, e);
21927    }
21928
21929    #[simd_test(enable = "avx512fp16,avx512vl")]
21930    unsafe fn test_mm_maskz_rsqrt_ph() {
21931        let a = _mm_set1_ph(4.0);
21932        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
21933        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21934        assert_eq_m128h(r, e);
21935    }
21936
21937    #[simd_test(enable = "avx512fp16,avx512vl")]
21938    unsafe fn test_mm256_rsqrt_ph() {
21939        let a = _mm256_set1_ph(4.0);
21940        let r = _mm256_rsqrt_ph(a);
21941        let e = _mm256_set1_ph(0.5);
21942        assert_eq_m256h(r, e);
21943    }
21944
21945    #[simd_test(enable = "avx512fp16,avx512vl")]
21946    unsafe fn test_mm256_mask_rsqrt_ph() {
21947        let a = _mm256_set1_ph(4.0);
21948        let src = _mm256_set1_ph(1.0);
21949        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
21950        let e = _mm256_set_ph(
21951            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21952        );
21953        assert_eq_m256h(r, e);
21954    }
21955
21956    #[simd_test(enable = "avx512fp16,avx512vl")]
21957    unsafe fn test_mm256_maskz_rsqrt_ph() {
21958        let a = _mm256_set1_ph(4.0);
21959        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
21960        let e = _mm256_set_ph(
21961            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21962        );
21963        assert_eq_m256h(r, e);
21964    }
21965
21966    #[simd_test(enable = "avx512fp16")]
21967    unsafe fn test_mm512_rsqrt_ph() {
21968        let a = _mm512_set1_ph(4.0);
21969        let r = _mm512_rsqrt_ph(a);
21970        let e = _mm512_set1_ph(0.5);
21971        assert_eq_m512h(r, e);
21972    }
21973
21974    #[simd_test(enable = "avx512fp16")]
21975    unsafe fn test_mm512_mask_rsqrt_ph() {
21976        let a = _mm512_set1_ph(4.0);
21977        let src = _mm512_set1_ph(1.0);
21978        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
21979        let e = _mm512_set_ph(
21980            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21981            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21982        );
21983        assert_eq_m512h(r, e);
21984    }
21985
21986    #[simd_test(enable = "avx512fp16")]
21987    unsafe fn test_mm512_maskz_rsqrt_ph() {
21988        let a = _mm512_set1_ph(4.0);
21989        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
21990        let e = _mm512_set_ph(
21991            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21992            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21993        );
21994        assert_eq_m512h(r, e);
21995    }
21996
21997    #[simd_test(enable = "avx512fp16")]
21998    unsafe fn test_mm_rsqrt_sh() {
21999        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22000        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22001        let r = _mm_rsqrt_sh(a, b);
22002        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22003        assert_eq_m128h(r, e);
22004    }
22005
22006    #[simd_test(enable = "avx512fp16")]
22007    unsafe fn test_mm_mask_rsqrt_sh() {
22008        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22009        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22010        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22011        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
22012        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22013        assert_eq_m128h(r, e);
22014        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
22015        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22016        assert_eq_m128h(r, e);
22017    }
22018
22019    #[simd_test(enable = "avx512fp16")]
22020    unsafe fn test_mm_maskz_rsqrt_sh() {
22021        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22022        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22023        let r = _mm_maskz_rsqrt_sh(0, a, b);
22024        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22025        assert_eq_m128h(r, e);
22026        let r = _mm_maskz_rsqrt_sh(1, a, b);
22027        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22028        assert_eq_m128h(r, e);
22029    }
22030
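    // SQRT computes an exact (correctly rounded) square root: sqrt(4.0) = 2.0.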
22031    #[simd_test(enable = "avx512fp16,avx512vl")]
22032    unsafe fn test_mm_sqrt_ph() {
22033        let a = _mm_set1_ph(4.0);
22034        let r = _mm_sqrt_ph(a);
22035        let e = _mm_set1_ph(2.0);
22036        assert_eq_m128h(r, e);
22037    }
22038
22039    #[simd_test(enable = "avx512fp16,avx512vl")]
22040    unsafe fn test_mm_mask_sqrt_ph() {
22041        let a = _mm_set1_ph(4.0);
22042        let src = _mm_set1_ph(1.0);
22043        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
22044        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
22045        assert_eq_m128h(r, e);
22046    }
22047
22048    #[simd_test(enable = "avx512fp16,avx512vl")]
22049    unsafe fn test_mm_maskz_sqrt_ph() {
22050        let a = _mm_set1_ph(4.0);
22051        let r = _mm_maskz_sqrt_ph(0b01010101, a);
22052        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22053        assert_eq_m128h(r, e);
22054    }
22055
22056    #[simd_test(enable = "avx512fp16,avx512vl")]
22057    unsafe fn test_mm256_sqrt_ph() {
22058        let a = _mm256_set1_ph(4.0);
22059        let r = _mm256_sqrt_ph(a);
22060        let e = _mm256_set1_ph(2.0);
22061        assert_eq_m256h(r, e);
22062    }
22063
22064    #[simd_test(enable = "avx512fp16,avx512vl")]
22065    unsafe fn test_mm256_mask_sqrt_ph() {
22066        let a = _mm256_set1_ph(4.0);
22067        let src = _mm256_set1_ph(1.0);
22068        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
22069        let e = _mm256_set_ph(
22070            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22071        );
22072        assert_eq_m256h(r, e);
22073    }
22074
22075    #[simd_test(enable = "avx512fp16,avx512vl")]
22076    unsafe fn test_mm256_maskz_sqrt_ph() {
22077        let a = _mm256_set1_ph(4.0);
22078        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
22079        let e = _mm256_set_ph(
22080            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22081        );
22082        assert_eq_m256h(r, e);
22083    }
22084
22085    #[simd_test(enable = "avx512fp16")]
22086    unsafe fn test_mm512_sqrt_ph() {
22087        let a = _mm512_set1_ph(4.0);
22088        let r = _mm512_sqrt_ph(a);
22089        let e = _mm512_set1_ph(2.0);
22090        assert_eq_m512h(r, e);
22091    }
22092
22093    #[simd_test(enable = "avx512fp16")]
22094    unsafe fn test_mm512_mask_sqrt_ph() {
22095        let a = _mm512_set1_ph(4.0);
22096        let src = _mm512_set1_ph(1.0);
22097        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
22098        let e = _mm512_set_ph(
22099            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22100            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22101        );
22102        assert_eq_m512h(r, e);
22103    }
22104
22105    #[simd_test(enable = "avx512fp16")]
22106    unsafe fn test_mm512_maskz_sqrt_ph() {
22107        let a = _mm512_set1_ph(4.0);
22108        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
22109        let e = _mm512_set_ph(
22110            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22111            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22112        );
22113        assert_eq_m512h(r, e);
22114    }
22115
22116    #[simd_test(enable = "avx512fp16")]
22117    unsafe fn test_mm512_sqrt_round_ph() {
22118        let a = _mm512_set1_ph(4.0);
22119        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
22120        let e = _mm512_set1_ph(2.0);
22121        assert_eq_m512h(r, e);
22122    }
22123
22124    #[simd_test(enable = "avx512fp16")]
22125    unsafe fn test_mm512_mask_sqrt_round_ph() {
22126        let a = _mm512_set1_ph(4.0);
22127        let src = _mm512_set1_ph(1.0);
22128        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22129            src,
22130            0b01010101010101010101010101010101,
22131            a,
22132        );
22133        let e = _mm512_set_ph(
22134            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22135            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22136        );
22137        assert_eq_m512h(r, e);
22138    }
22139
22140    #[simd_test(enable = "avx512fp16")]
22141    unsafe fn test_mm512_maskz_sqrt_round_ph() {
22142        let a = _mm512_set1_ph(4.0);
22143        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22144            0b01010101010101010101010101010101,
22145            a,
22146        );
22147        let e = _mm512_set_ph(
22148            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22149            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22150        );
22151        assert_eq_m512h(r, e);
22152    }
22153
22154    #[simd_test(enable = "avx512fp16")]
22155    unsafe fn test_mm_sqrt_sh() {
22156        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22157        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22158        let r = _mm_sqrt_sh(a, b);
22159        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22160        assert_eq_m128h(r, e);
22161    }
22162
22163    #[simd_test(enable = "avx512fp16")]
22164    unsafe fn test_mm_mask_sqrt_sh() {
22165        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22166        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22167        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22168        let r = _mm_mask_sqrt_sh(src, 0, a, b);
22169        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22170        assert_eq_m128h(r, e);
22171        let r = _mm_mask_sqrt_sh(src, 1, a, b);
22172        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22173        assert_eq_m128h(r, e);
22174    }
22175
22176    #[simd_test(enable = "avx512fp16")]
22177    unsafe fn test_mm_maskz_sqrt_sh() {
22178        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22179        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22180        let r = _mm_maskz_sqrt_sh(0, a, b);
22181        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22182        assert_eq_m128h(r, e);
22183        let r = _mm_maskz_sqrt_sh(1, a, b);
22184        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22185        assert_eq_m128h(r, e);
22186    }
22187
22188    #[simd_test(enable = "avx512fp16")]
22189    unsafe fn test_mm_sqrt_round_sh() {
22190        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22191        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22192        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22193        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22194        assert_eq_m128h(r, e);
22195    }
22196
22197    #[simd_test(enable = "avx512fp16")]
22198    unsafe fn test_mm_mask_sqrt_round_sh() {
22199        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22200        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22201        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22202        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22203            src, 0, a, b,
22204        );
22205        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22206        assert_eq_m128h(r, e);
22207        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22208            src, 1, a, b,
22209        );
22210        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22211        assert_eq_m128h(r, e);
22212    }
22213
22214    #[simd_test(enable = "avx512fp16")]
22215    unsafe fn test_mm_maskz_sqrt_round_sh() {
22216        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22217        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22218        let r =
22219            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22220        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22221        assert_eq_m128h(r, e);
22222        let r =
22223            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22224        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22225        assert_eq_m128h(r, e);
22226    }
22227
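    // Maximum tests: the packed variants use a == 2.0 and b == 1.0, so selected lanes
    // yield 2.0, merge-masked lanes keep src (3.0), and zero-masked lanes become 0.0.
    // The scalar `_sh` variants operate on lane 0 only and copy lanes 1..8 from a.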
22228    #[simd_test(enable = "avx512fp16,avx512vl")]
22229    unsafe fn test_mm_max_ph() {
22230        let a = _mm_set1_ph(2.0);
22231        let b = _mm_set1_ph(1.0);
22232        let r = _mm_max_ph(a, b);
22233        let e = _mm_set1_ph(2.0);
22234        assert_eq_m128h(r, e);
22235    }
22236
22237    #[simd_test(enable = "avx512fp16,avx512vl")]
22238    unsafe fn test_mm_mask_max_ph() {
22239        let a = _mm_set1_ph(2.0);
22240        let b = _mm_set1_ph(1.0);
22241        let src = _mm_set1_ph(3.0);
22242        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
22243        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
22244        assert_eq_m128h(r, e);
22245    }
22246
22247    #[simd_test(enable = "avx512fp16,avx512vl")]
22248    unsafe fn test_mm_maskz_max_ph() {
22249        let a = _mm_set1_ph(2.0);
22250        let b = _mm_set1_ph(1.0);
22251        let r = _mm_maskz_max_ph(0b01010101, a, b);
22252        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22253        assert_eq_m128h(r, e);
22254    }
22255
22256    #[simd_test(enable = "avx512fp16,avx512vl")]
22257    unsafe fn test_mm256_max_ph() {
22258        let a = _mm256_set1_ph(2.0);
22259        let b = _mm256_set1_ph(1.0);
22260        let r = _mm256_max_ph(a, b);
22261        let e = _mm256_set1_ph(2.0);
22262        assert_eq_m256h(r, e);
22263    }
22264
22265    #[simd_test(enable = "avx512fp16,avx512vl")]
22266    unsafe fn test_mm256_mask_max_ph() {
22267        let a = _mm256_set1_ph(2.0);
22268        let b = _mm256_set1_ph(1.0);
22269        let src = _mm256_set1_ph(3.0);
22270        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
22271        let e = _mm256_set_ph(
22272            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22273        );
22274        assert_eq_m256h(r, e);
22275    }
22276
22277    #[simd_test(enable = "avx512fp16,avx512vl")]
22278    unsafe fn test_mm256_maskz_max_ph() {
22279        let a = _mm256_set1_ph(2.0);
22280        let b = _mm256_set1_ph(1.0);
22281        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
22282        let e = _mm256_set_ph(
22283            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22284        );
22285        assert_eq_m256h(r, e);
22286    }
22287
22288    #[simd_test(enable = "avx512fp16")]
22289    unsafe fn test_mm512_max_ph() {
22290        let a = _mm512_set1_ph(2.0);
22291        let b = _mm512_set1_ph(1.0);
22292        let r = _mm512_max_ph(a, b);
22293        let e = _mm512_set1_ph(2.0);
22294        assert_eq_m512h(r, e);
22295    }
22296
22297    #[simd_test(enable = "avx512fp16")]
22298    unsafe fn test_mm512_mask_max_ph() {
22299        let a = _mm512_set1_ph(2.0);
22300        let b = _mm512_set1_ph(1.0);
22301        let src = _mm512_set1_ph(3.0);
22302        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
22303        let e = _mm512_set_ph(
22304            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22305            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22306        );
22307        assert_eq_m512h(r, e);
22308    }
22309
22310    #[simd_test(enable = "avx512fp16")]
22311    unsafe fn test_mm512_maskz_max_ph() {
22312        let a = _mm512_set1_ph(2.0);
22313        let b = _mm512_set1_ph(1.0);
22314        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
22315        let e = _mm512_set_ph(
22316            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22317            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22318        );
22319        assert_eq_m512h(r, e);
22320    }
22321
22322    #[simd_test(enable = "avx512fp16")]
22323    unsafe fn test_mm512_max_round_ph() {
22324        let a = _mm512_set1_ph(2.0);
22325        let b = _mm512_set1_ph(1.0);
22326        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22327        let e = _mm512_set1_ph(2.0);
22328        assert_eq_m512h(r, e);
22329    }
22330
22331    #[simd_test(enable = "avx512fp16")]
22332    unsafe fn test_mm512_mask_max_round_ph() {
22333        let a = _mm512_set1_ph(2.0);
22334        let b = _mm512_set1_ph(1.0);
22335        let src = _mm512_set1_ph(3.0);
22336        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22337            src,
22338            0b01010101010101010101010101010101,
22339            a,
22340            b,
22341        );
22342        let e = _mm512_set_ph(
22343            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22344            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22345        );
22346        assert_eq_m512h(r, e);
22347    }
22348
22349    #[simd_test(enable = "avx512fp16")]
22350    unsafe fn test_mm512_maskz_max_round_ph() {
22351        let a = _mm512_set1_ph(2.0);
22352        let b = _mm512_set1_ph(1.0);
22353        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22354            0b01010101010101010101010101010101,
22355            a,
22356            b,
22357        );
22358        let e = _mm512_set_ph(
22359            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22360            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22361        );
22362        assert_eq_m512h(r, e);
22363    }
22364
22365    #[simd_test(enable = "avx512fp16")]
22366    unsafe fn test_mm_max_sh() {
22367        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22368        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22369        let r = _mm_max_sh(a, b);
22370        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22371        assert_eq_m128h(r, e);
22372    }
22373
22374    #[simd_test(enable = "avx512fp16")]
22375    unsafe fn test_mm_mask_max_sh() {
22376        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22377        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22378        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22379        let r = _mm_mask_max_sh(src, 0, a, b);
22380        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22381        assert_eq_m128h(r, e);
22382        let r = _mm_mask_max_sh(src, 1, a, b);
22383        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22384        assert_eq_m128h(r, e);
22385    }
22386
22387    #[simd_test(enable = "avx512fp16")]
22388    unsafe fn test_mm_maskz_max_sh() {
22389        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22390        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22391        let r = _mm_maskz_max_sh(0, a, b);
22392        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22393        assert_eq_m128h(r, e);
22394        let r = _mm_maskz_max_sh(1, a, b);
22395        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22396        assert_eq_m128h(r, e);
22397    }
22398
22399    #[simd_test(enable = "avx512fp16")]
22400    unsafe fn test_mm_max_round_sh() {
22401        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22402        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22403        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22404        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22405        assert_eq_m128h(r, e);
22406    }
22407
22408    #[simd_test(enable = "avx512fp16")]
22409    unsafe fn test_mm_mask_max_round_sh() {
22410        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22411        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22412        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22413        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22414            src, 0, a, b,
22415        );
22416        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22417        assert_eq_m128h(r, e);
22418        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22419            src, 1, a, b,
22420        );
22421        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22422        assert_eq_m128h(r, e);
22423    }
22424
22425    #[simd_test(enable = "avx512fp16")]
22426    unsafe fn test_mm_maskz_max_round_sh() {
22427        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22428        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22429        let r =
22430            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22431        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22432        assert_eq_m128h(r, e);
22433        let r =
22434            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22435        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22436        assert_eq_m128h(r, e);
22437    }
22438
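    // Minimum tests: mirror of the maximum tests above, expecting 1.0 in the selected
    // lanes instead of 2.0.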
22439    #[simd_test(enable = "avx512fp16,avx512vl")]
22440    unsafe fn test_mm_min_ph() {
22441        let a = _mm_set1_ph(2.0);
22442        let b = _mm_set1_ph(1.0);
22443        let r = _mm_min_ph(a, b);
22444        let e = _mm_set1_ph(1.0);
22445        assert_eq_m128h(r, e);
22446    }
22447
22448    #[simd_test(enable = "avx512fp16,avx512vl")]
22449    unsafe fn test_mm_mask_min_ph() {
22450        let a = _mm_set1_ph(2.0);
22451        let b = _mm_set1_ph(1.0);
22452        let src = _mm_set1_ph(3.0);
22453        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
22454        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
22455        assert_eq_m128h(r, e);
22456    }
22457
22458    #[simd_test(enable = "avx512fp16,avx512vl")]
22459    unsafe fn test_mm_maskz_min_ph() {
22460        let a = _mm_set1_ph(2.0);
22461        let b = _mm_set1_ph(1.0);
22462        let r = _mm_maskz_min_ph(0b01010101, a, b);
22463        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22464        assert_eq_m128h(r, e);
22465    }
22466
22467    #[simd_test(enable = "avx512fp16,avx512vl")]
22468    unsafe fn test_mm256_min_ph() {
22469        let a = _mm256_set1_ph(2.0);
22470        let b = _mm256_set1_ph(1.0);
22471        let r = _mm256_min_ph(a, b);
22472        let e = _mm256_set1_ph(1.0);
22473        assert_eq_m256h(r, e);
22474    }
22475
22476    #[simd_test(enable = "avx512fp16,avx512vl")]
22477    unsafe fn test_mm256_mask_min_ph() {
22478        let a = _mm256_set1_ph(2.0);
22479        let b = _mm256_set1_ph(1.0);
22480        let src = _mm256_set1_ph(3.0);
22481        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
22482        let e = _mm256_set_ph(
22483            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22484        );
22485        assert_eq_m256h(r, e);
22486    }
22487
22488    #[simd_test(enable = "avx512fp16,avx512vl")]
22489    unsafe fn test_mm256_maskz_min_ph() {
22490        let a = _mm256_set1_ph(2.0);
22491        let b = _mm256_set1_ph(1.0);
22492        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
22493        let e = _mm256_set_ph(
22494            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22495        );
22496        assert_eq_m256h(r, e);
22497    }
22498
22499    #[simd_test(enable = "avx512fp16")]
22500    unsafe fn test_mm512_min_ph() {
22501        let a = _mm512_set1_ph(2.0);
22502        let b = _mm512_set1_ph(1.0);
22503        let r = _mm512_min_ph(a, b);
22504        let e = _mm512_set1_ph(1.0);
22505        assert_eq_m512h(r, e);
22506    }
22507
22508    #[simd_test(enable = "avx512fp16")]
22509    unsafe fn test_mm512_mask_min_ph() {
22510        let a = _mm512_set1_ph(2.0);
22511        let b = _mm512_set1_ph(1.0);
22512        let src = _mm512_set1_ph(3.0);
22513        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
22514        let e = _mm512_set_ph(
22515            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22516            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22517        );
22518        assert_eq_m512h(r, e);
22519    }
22520
22521    #[simd_test(enable = "avx512fp16")]
22522    unsafe fn test_mm512_maskz_min_ph() {
22523        let a = _mm512_set1_ph(2.0);
22524        let b = _mm512_set1_ph(1.0);
22525        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
22526        let e = _mm512_set_ph(
22527            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22528            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22529        );
22530        assert_eq_m512h(r, e);
22531    }
22532
22533    #[simd_test(enable = "avx512fp16")]
22534    unsafe fn test_mm512_min_round_ph() {
22535        let a = _mm512_set1_ph(2.0);
22536        let b = _mm512_set1_ph(1.0);
22537        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22538        let e = _mm512_set1_ph(1.0);
22539        assert_eq_m512h(r, e);
22540    }
22541
22542    #[simd_test(enable = "avx512fp16")]
22543    unsafe fn test_mm512_mask_min_round_ph() {
22544        let a = _mm512_set1_ph(2.0);
22545        let b = _mm512_set1_ph(1.0);
22546        let src = _mm512_set1_ph(3.0);
22547        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22548            src,
22549            0b01010101010101010101010101010101,
22550            a,
22551            b,
22552        );
22553        let e = _mm512_set_ph(
22554            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22555            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22556        );
22557        assert_eq_m512h(r, e);
22558    }
22559
22560    #[simd_test(enable = "avx512fp16")]
22561    unsafe fn test_mm512_maskz_min_round_ph() {
22562        let a = _mm512_set1_ph(2.0);
22563        let b = _mm512_set1_ph(1.0);
22564        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22565            0b01010101010101010101010101010101,
22566            a,
22567            b,
22568        );
22569        let e = _mm512_set_ph(
22570            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22571            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22572        );
22573        assert_eq_m512h(r, e);
22574    }
22575
22576    #[simd_test(enable = "avx512fp16")]
22577    unsafe fn test_mm_min_sh() {
22578        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22579        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22580        let r = _mm_min_sh(a, b);
22581        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22582        assert_eq_m128h(r, e);
22583    }
22584
22585    #[simd_test(enable = "avx512fp16")]
22586    unsafe fn test_mm_mask_min_sh() {
22587        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22588        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22589        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22590        let r = _mm_mask_min_sh(src, 0, a, b);
22591        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22592        assert_eq_m128h(r, e);
22593        let r = _mm_mask_min_sh(src, 1, a, b);
22594        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22595        assert_eq_m128h(r, e);
22596    }
22597
22598    #[simd_test(enable = "avx512fp16")]
22599    unsafe fn test_mm_maskz_min_sh() {
22600        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22601        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22602        let r = _mm_maskz_min_sh(0, a, b);
22603        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22604        assert_eq_m128h(r, e);
22605        let r = _mm_maskz_min_sh(1, a, b);
22606        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22607        assert_eq_m128h(r, e);
22608    }
22609
22610    #[simd_test(enable = "avx512fp16")]
22611    unsafe fn test_mm_min_round_sh() {
22612        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22613        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22614        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22615        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22616        assert_eq_m128h(r, e);
22617    }
22618
22619    #[simd_test(enable = "avx512fp16")]
22620    unsafe fn test_mm_mask_min_round_sh() {
22621        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22622        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22623        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22624        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22625            src, 0, a, b,
22626        );
22627        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22628        assert_eq_m128h(r, e);
22629        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22630            src, 1, a, b,
22631        );
22632        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22633        assert_eq_m128h(r, e);
22634    }
22635
22636    #[simd_test(enable = "avx512fp16")]
22637    unsafe fn test_mm_maskz_min_round_sh() {
22638        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22639        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22640        let r =
22641            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22642        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22643        assert_eq_m128h(r, e);
22644        let r =
22645            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22646        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22647        assert_eq_m128h(r, e);
22648    }
22649
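    // getexp tests: the intrinsic returns floor(log2(|x|)) as a half-precision value,
    // so getexp(3.0) == 1.0. The scalar variants take the exponent from b and the upper
    // lanes from a.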
22650    #[simd_test(enable = "avx512fp16,avx512vl")]
22651    unsafe fn test_mm_getexp_ph() {
22652        let a = _mm_set1_ph(3.0);
22653        let r = _mm_getexp_ph(a);
22654        let e = _mm_set1_ph(1.0);
22655        assert_eq_m128h(r, e);
22656    }
22657
22658    #[simd_test(enable = "avx512fp16,avx512vl")]
22659    unsafe fn test_mm_mask_getexp_ph() {
22660        let a = _mm_set1_ph(3.0);
22661        let src = _mm_set1_ph(4.0);
22662        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
22663        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
22664        assert_eq_m128h(r, e);
22665    }
22666
22667    #[simd_test(enable = "avx512fp16,avx512vl")]
22668    unsafe fn test_mm_maskz_getexp_ph() {
22669        let a = _mm_set1_ph(3.0);
22670        let r = _mm_maskz_getexp_ph(0b01010101, a);
22671        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22672        assert_eq_m128h(r, e);
22673    }
22674
22675    #[simd_test(enable = "avx512fp16,avx512vl")]
22676    unsafe fn test_mm256_getexp_ph() {
22677        let a = _mm256_set1_ph(3.0);
22678        let r = _mm256_getexp_ph(a);
22679        let e = _mm256_set1_ph(1.0);
22680        assert_eq_m256h(r, e);
22681    }
22682
22683    #[simd_test(enable = "avx512fp16,avx512vl")]
22684    unsafe fn test_mm256_mask_getexp_ph() {
22685        let a = _mm256_set1_ph(3.0);
22686        let src = _mm256_set1_ph(4.0);
22687        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
22688        let e = _mm256_set_ph(
22689            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22690        );
22691        assert_eq_m256h(r, e);
22692    }
22693
22694    #[simd_test(enable = "avx512fp16,avx512vl")]
22695    unsafe fn test_mm256_maskz_getexp_ph() {
22696        let a = _mm256_set1_ph(3.0);
22697        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
22698        let e = _mm256_set_ph(
22699            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22700        );
22701        assert_eq_m256h(r, e);
22702    }
22703
22704    #[simd_test(enable = "avx512fp16")]
22705    unsafe fn test_mm512_getexp_ph() {
22706        let a = _mm512_set1_ph(3.0);
22707        let r = _mm512_getexp_ph(a);
22708        let e = _mm512_set1_ph(1.0);
22709        assert_eq_m512h(r, e);
22710    }
22711
22712    #[simd_test(enable = "avx512fp16")]
22713    unsafe fn test_mm512_mask_getexp_ph() {
22714        let a = _mm512_set1_ph(3.0);
22715        let src = _mm512_set1_ph(4.0);
22716        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
22717        let e = _mm512_set_ph(
22718            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22719            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22720        );
22721        assert_eq_m512h(r, e);
22722    }
22723
22724    #[simd_test(enable = "avx512fp16")]
22725    unsafe fn test_mm512_maskz_getexp_ph() {
22726        let a = _mm512_set1_ph(3.0);
22727        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
22728        let e = _mm512_set_ph(
22729            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22730            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22731        );
22732        assert_eq_m512h(r, e);
22733    }
22734
22735    #[simd_test(enable = "avx512fp16")]
22736    unsafe fn test_mm512_getexp_round_ph() {
22737        let a = _mm512_set1_ph(3.0);
22738        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
22739        let e = _mm512_set1_ph(1.0);
22740        assert_eq_m512h(r, e);
22741    }
22742
22743    #[simd_test(enable = "avx512fp16")]
22744    unsafe fn test_mm512_mask_getexp_round_ph() {
22745        let a = _mm512_set1_ph(3.0);
22746        let src = _mm512_set1_ph(4.0);
22747        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22748            src,
22749            0b01010101010101010101010101010101,
22750            a,
22751        );
22752        let e = _mm512_set_ph(
22753            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22754            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22755        );
22756        assert_eq_m512h(r, e);
22757    }
22758
22759    #[simd_test(enable = "avx512fp16")]
22760    unsafe fn test_mm512_maskz_getexp_round_ph() {
22761        let a = _mm512_set1_ph(3.0);
22762        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22763            0b01010101010101010101010101010101,
22764            a,
22765        );
22766        let e = _mm512_set_ph(
22767            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22768            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22769        );
22770        assert_eq_m512h(r, e);
22771    }
22772
22773    #[simd_test(enable = "avx512fp16")]
22774    unsafe fn test_mm_getexp_sh() {
22775        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22776        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22777        let r = _mm_getexp_sh(a, b);
22778        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22779        assert_eq_m128h(r, e);
22780    }
22781
22782    #[simd_test(enable = "avx512fp16")]
22783    unsafe fn test_mm_mask_getexp_sh() {
22784        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22785        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22786        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22787        let r = _mm_mask_getexp_sh(src, 0, a, b);
22788        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22789        assert_eq_m128h(r, e);
22790        let r = _mm_mask_getexp_sh(src, 1, a, b);
22791        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22792        assert_eq_m128h(r, e);
22793    }
22794
22795    #[simd_test(enable = "avx512fp16")]
22796    unsafe fn test_mm_maskz_getexp_sh() {
22797        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22798        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22799        let r = _mm_maskz_getexp_sh(0, a, b);
22800        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22801        assert_eq_m128h(r, e);
22802        let r = _mm_maskz_getexp_sh(1, a, b);
22803        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22804        assert_eq_m128h(r, e);
22805    }
22806
22807    #[simd_test(enable = "avx512fp16")]
22808    unsafe fn test_mm_getexp_round_sh() {
22809        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22810        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22811        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
22812        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22813        assert_eq_m128h(r, e);
22814    }
22815
22816    #[simd_test(enable = "avx512fp16")]
22817    unsafe fn test_mm_mask_getexp_round_sh() {
22818        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22819        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22820        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22821        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
22822        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22823        assert_eq_m128h(r, e);
22824        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
22825        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22826        assert_eq_m128h(r, e);
22827    }
22828
22829    #[simd_test(enable = "avx512fp16")]
22830    unsafe fn test_mm_maskz_getexp_round_sh() {
22831        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22832        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22833        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
22834        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22835        assert_eq_m128h(r, e);
22836        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
22837        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22838        assert_eq_m128h(r, e);
22839    }
22840
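    // getmant tests: with `_MM_MANT_NORM_P75_1P5` the mantissa is normalized into
    // [0.75, 1.5); 10.0 == 1.25 * 2^3, so the expected mantissa is 1.25. The sign
    // control (`_MM_MANT_SIGN_NAN` here) has no effect for these positive inputs.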
22841    #[simd_test(enable = "avx512fp16,avx512vl")]
22842    unsafe fn test_mm_getmant_ph() {
22843        let a = _mm_set1_ph(10.0);
22844        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22845        let e = _mm_set1_ph(1.25);
22846        assert_eq_m128h(r, e);
22847    }
22848
22849    #[simd_test(enable = "avx512fp16,avx512vl")]
22850    unsafe fn test_mm_mask_getmant_ph() {
22851        let a = _mm_set1_ph(10.0);
22852        let src = _mm_set1_ph(20.0);
22853        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
22854        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
22855        assert_eq_m128h(r, e);
22856    }
22857
22858    #[simd_test(enable = "avx512fp16,avx512vl")]
22859    unsafe fn test_mm_maskz_getmant_ph() {
22860        let a = _mm_set1_ph(10.0);
22861        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
22862        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
22863        assert_eq_m128h(r, e);
22864    }
22865
22866    #[simd_test(enable = "avx512fp16,avx512vl")]
22867    unsafe fn test_mm256_getmant_ph() {
22868        let a = _mm256_set1_ph(10.0);
22869        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22870        let e = _mm256_set1_ph(1.25);
22871        assert_eq_m256h(r, e);
22872    }
22873
22874    #[simd_test(enable = "avx512fp16,avx512vl")]
22875    unsafe fn test_mm256_mask_getmant_ph() {
22876        let a = _mm256_set1_ph(10.0);
22877        let src = _mm256_set1_ph(20.0);
22878        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22879            src,
22880            0b0101010101010101,
22881            a,
22882        );
22883        let e = _mm256_set_ph(
22884            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22885            20.0, 1.25,
22886        );
22887        assert_eq_m256h(r, e);
22888    }
22889
22890    #[simd_test(enable = "avx512fp16,avx512vl")]
22891    unsafe fn test_mm256_maskz_getmant_ph() {
22892        let a = _mm256_set1_ph(10.0);
22893        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22894            0b0101010101010101,
22895            a,
22896        );
22897        let e = _mm256_set_ph(
22898            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22899        );
22900        assert_eq_m256h(r, e);
22901    }
22902
22903    #[simd_test(enable = "avx512fp16")]
22904    unsafe fn test_mm512_getmant_ph() {
22905        let a = _mm512_set1_ph(10.0);
22906        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22907        let e = _mm512_set1_ph(1.25);
22908        assert_eq_m512h(r, e);
22909    }
22910
22911    #[simd_test(enable = "avx512fp16")]
22912    unsafe fn test_mm512_mask_getmant_ph() {
22913        let a = _mm512_set1_ph(10.0);
22914        let src = _mm512_set1_ph(20.0);
22915        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22916            src,
22917            0b01010101010101010101010101010101,
22918            a,
22919        );
22920        let e = _mm512_set_ph(
22921            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22922            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22923            20.0, 1.25, 20.0, 1.25,
22924        );
22925        assert_eq_m512h(r, e);
22926    }
22927
22928    #[simd_test(enable = "avx512fp16")]
22929    unsafe fn test_mm512_maskz_getmant_ph() {
22930        let a = _mm512_set1_ph(10.0);
22931        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22932            0b01010101010101010101010101010101,
22933            a,
22934        );
22935        let e = _mm512_set_ph(
22936            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22937            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22938        );
22939        assert_eq_m512h(r, e);
22940    }
22941
22942    #[simd_test(enable = "avx512fp16")]
22943    unsafe fn test_mm512_getmant_round_ph() {
22944        let a = _mm512_set1_ph(10.0);
22945        let r =
22946            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
22947                a,
22948            );
22949        let e = _mm512_set1_ph(1.25);
22950        assert_eq_m512h(r, e);
22951    }
22952
22953    #[simd_test(enable = "avx512fp16")]
22954    unsafe fn test_mm512_mask_getmant_round_ph() {
22955        let a = _mm512_set1_ph(10.0);
22956        let src = _mm512_set1_ph(20.0);
22957        let r = _mm512_mask_getmant_round_ph::<
22958            _MM_MANT_NORM_P75_1P5,
22959            _MM_MANT_SIGN_NAN,
22960            _MM_FROUND_NO_EXC,
22961        >(src, 0b01010101010101010101010101010101, a);
22962        let e = _mm512_set_ph(
22963            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22964            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22965            20.0, 1.25, 20.0, 1.25,
22966        );
22967        assert_eq_m512h(r, e);
22968    }
22969
22970    #[simd_test(enable = "avx512fp16")]
22971    unsafe fn test_mm512_maskz_getmant_round_ph() {
22972        let a = _mm512_set1_ph(10.0);
22973        let r = _mm512_maskz_getmant_round_ph::<
22974            _MM_MANT_NORM_P75_1P5,
22975            _MM_MANT_SIGN_NAN,
22976            _MM_FROUND_NO_EXC,
22977        >(0b01010101010101010101010101010101, a);
22978        let e = _mm512_set_ph(
22979            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22980            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22981        );
22982        assert_eq_m512h(r, e);
22983    }
22984
22985    #[simd_test(enable = "avx512fp16")]
22986    unsafe fn test_mm_getmant_sh() {
22987        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22988        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22989        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
22990        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
22991        assert_eq_m128h(r, e);
22992    }
22993
22994    #[simd_test(enable = "avx512fp16")]
22995    unsafe fn test_mm_mask_getmant_sh() {
22996        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22997        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22998        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
22999        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
23000        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
23001        assert_eq_m128h(r, e);
23002        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
23003        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23004        assert_eq_m128h(r, e);
23005    }
23006
23007    #[simd_test(enable = "avx512fp16")]
23008    unsafe fn test_mm_maskz_getmant_sh() {
23009        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23010        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23011        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
23012        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23013        assert_eq_m128h(r, e);
23014        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
23015        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23016        assert_eq_m128h(r, e);
23017    }
23018
23019    #[simd_test(enable = "avx512fp16")]
23020    unsafe fn test_mm_getmant_round_sh() {
23021        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23022        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23023        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
23024            a, b,
23025        );
23026        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23027        assert_eq_m128h(r, e);
23028    }
23029
23030    #[simd_test(enable = "avx512fp16")]
23031    unsafe fn test_mm_mask_getmant_round_sh() {
23032        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23033        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23034        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
23035        let r = _mm_mask_getmant_round_sh::<
23036            _MM_MANT_NORM_P75_1P5,
23037            _MM_MANT_SIGN_NAN,
23038            _MM_FROUND_NO_EXC,
23039        >(src, 0, a, b);
23040        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
23041        assert_eq_m128h(r, e);
23042        let r = _mm_mask_getmant_round_sh::<
23043            _MM_MANT_NORM_P75_1P5,
23044            _MM_MANT_SIGN_NAN,
23045            _MM_FROUND_NO_EXC,
23046        >(src, 1, a, b);
23047        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23048        assert_eq_m128h(r, e);
23049    }
23050
23051    #[simd_test(enable = "avx512fp16")]
23052    unsafe fn test_mm_maskz_getmant_round_sh() {
23053        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23054        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23055        let r = _mm_maskz_getmant_round_sh::<
23056            _MM_MANT_NORM_P75_1P5,
23057            _MM_MANT_SIGN_NAN,
23058            _MM_FROUND_NO_EXC,
23059        >(0, a, b);
23060        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23061        assert_eq_m128h(r, e);
23062        let r = _mm_maskz_getmant_round_sh::<
23063            _MM_MANT_NORM_P75_1P5,
23064            _MM_MANT_SIGN_NAN,
23065            _MM_FROUND_NO_EXC,
23066        >(1, a, b);
23067        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23068        assert_eq_m128h(r, e);
23069    }
23070
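    // roundscale tests: an immediate of 0 requests zero fraction bits with the default
    // rounding mode, so 1.1 rounds to 1.0.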
23071    #[simd_test(enable = "avx512fp16,avx512vl")]
23072    unsafe fn test_mm_roundscale_ph() {
23073        let a = _mm_set1_ph(1.1);
23074        let r = _mm_roundscale_ph::<0>(a);
23075        let e = _mm_set1_ph(1.0);
23076        assert_eq_m128h(r, e);
23077    }
23078
23079    #[simd_test(enable = "avx512fp16,avx512vl")]
23080    unsafe fn test_mm_mask_roundscale_ph() {
23081        let a = _mm_set1_ph(1.1);
23082        let src = _mm_set1_ph(2.0);
23083        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
23084        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
23085        assert_eq_m128h(r, e);
23086    }
23087
23088    #[simd_test(enable = "avx512fp16,avx512vl")]
23089    unsafe fn test_mm_maskz_roundscale_ph() {
23090        let a = _mm_set1_ph(1.1);
23091        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
23092        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
23093        assert_eq_m128h(r, e);
23094    }
23095
23096    #[simd_test(enable = "avx512fp16,avx512vl")]
23097    unsafe fn test_mm256_roundscale_ph() {
23098        let a = _mm256_set1_ph(1.1);
23099        let r = _mm256_roundscale_ph::<0>(a);
23100        let e = _mm256_set1_ph(1.0);
23101        assert_eq_m256h(r, e);
23102    }
23103
23104    #[simd_test(enable = "avx512fp16,avx512vl")]
23105    unsafe fn test_mm256_mask_roundscale_ph() {
23106        let a = _mm256_set1_ph(1.1);
23107        let src = _mm256_set1_ph(2.0);
23108        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
23109        let e = _mm256_set_ph(
23110            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23111        );
23112        assert_eq_m256h(r, e);
23113    }
23114
23115    #[simd_test(enable = "avx512fp16,avx512vl")]
23116    unsafe fn test_mm256_maskz_roundscale_ph() {
23117        let a = _mm256_set1_ph(1.1);
23118        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
23119        let e = _mm256_set_ph(
23120            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23121        );
23122        assert_eq_m256h(r, e);
23123    }
23124
23125    #[simd_test(enable = "avx512fp16")]
23126    unsafe fn test_mm512_roundscale_ph() {
23127        let a = _mm512_set1_ph(1.1);
23128        let r = _mm512_roundscale_ph::<0>(a);
23129        let e = _mm512_set1_ph(1.0);
23130        assert_eq_m512h(r, e);
23131    }
23132
23133    #[simd_test(enable = "avx512fp16")]
23134    unsafe fn test_mm512_mask_roundscale_ph() {
23135        let a = _mm512_set1_ph(1.1);
23136        let src = _mm512_set1_ph(2.0);
23137        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
23138        let e = _mm512_set_ph(
23139            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23140            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23141        );
23142        assert_eq_m512h(r, e);
23143    }
23144
23145    #[simd_test(enable = "avx512fp16")]
23146    unsafe fn test_mm512_maskz_roundscale_ph() {
23147        let a = _mm512_set1_ph(1.1);
23148        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
23149        let e = _mm512_set_ph(
23150            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23151            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23152        );
23153        assert_eq_m512h(r, e);
23154    }
23155
23156    #[simd_test(enable = "avx512fp16")]
23157    unsafe fn test_mm512_roundscale_round_ph() {
23158        let a = _mm512_set1_ph(1.1);
23159        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
23160        let e = _mm512_set1_ph(1.0);
23161        assert_eq_m512h(r, e);
23162    }
23163
23164    #[simd_test(enable = "avx512fp16")]
23165    unsafe fn test_mm512_mask_roundscale_round_ph() {
23166        let a = _mm512_set1_ph(1.1);
23167        let src = _mm512_set1_ph(2.0);
23168        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23169            src,
23170            0b01010101010101010101010101010101,
23171            a,
23172        );
23173        let e = _mm512_set_ph(
23174            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23175            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23176        );
23177        assert_eq_m512h(r, e);
23178    }
23179
23180    #[simd_test(enable = "avx512fp16")]
23181    unsafe fn test_mm512_maskz_roundscale_round_ph() {
23182        let a = _mm512_set1_ph(1.1);
23183        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23184            0b01010101010101010101010101010101,
23185            a,
23186        );
23187        let e = _mm512_set_ph(
23188            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23189            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23190        );
23191        assert_eq_m512h(r, e);
23192    }
23193
23194    #[simd_test(enable = "avx512fp16")]
23195    unsafe fn test_mm_roundscale_sh() {
23196        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23197        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23198        let r = _mm_roundscale_sh::<0>(a, b);
23199        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23200        assert_eq_m128h(r, e);
23201    }
23202
23203    #[simd_test(enable = "avx512fp16")]
23204    unsafe fn test_mm_mask_roundscale_sh() {
23205        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23206        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23207        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23208        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
23209        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23210        assert_eq_m128h(r, e);
23211        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
23212        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23213        assert_eq_m128h(r, e);
23214    }
23215
23216    #[simd_test(enable = "avx512fp16")]
23217    unsafe fn test_mm_maskz_roundscale_sh() {
23218        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23219        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23220        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
23221        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23222        assert_eq_m128h(r, e);
23223        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
23224        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23225        assert_eq_m128h(r, e);
23226    }
23227
23228    #[simd_test(enable = "avx512fp16")]
23229    unsafe fn test_mm_roundscale_round_sh() {
23230        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23231        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23232        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
23233        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23234        assert_eq_m128h(r, e);
23235    }
23236
23237    #[simd_test(enable = "avx512fp16")]
23238    unsafe fn test_mm_mask_roundscale_round_sh() {
23239        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23240        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23241        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23242        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
23243        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23244        assert_eq_m128h(r, e);
23245        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
23246        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23247        assert_eq_m128h(r, e);
23248    }
23249
23250    #[simd_test(enable = "avx512fp16")]
23251    unsafe fn test_mm_maskz_roundscale_round_sh() {
23252        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23253        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23254        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
23255        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23256        assert_eq_m128h(r, e);
23257        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
23258        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23259        assert_eq_m128h(r, e);
23260    }
23261
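    // scalef tests: the operation computes a * 2^floor(b), so scalef(1.0, 3.0) == 8.0.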
23262    #[simd_test(enable = "avx512fp16,avx512vl")]
23263    unsafe fn test_mm_scalef_ph() {
23264        let a = _mm_set1_ph(1.);
23265        let b = _mm_set1_ph(3.);
23266        let r = _mm_scalef_ph(a, b);
23267        let e = _mm_set1_ph(8.0);
23268        assert_eq_m128h(r, e);
23269    }
23270
23271    #[simd_test(enable = "avx512fp16,avx512vl")]
23272    unsafe fn test_mm_mask_scalef_ph() {
23273        let a = _mm_set1_ph(1.);
23274        let b = _mm_set1_ph(3.);
23275        let src = _mm_set1_ph(2.);
23276        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
23277        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
23278        assert_eq_m128h(r, e);
23279    }
23280
23281    #[simd_test(enable = "avx512fp16,avx512vl")]
23282    unsafe fn test_mm_maskz_scalef_ph() {
23283        let a = _mm_set1_ph(1.);
23284        let b = _mm_set1_ph(3.);
23285        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
23286        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
23287        assert_eq_m128h(r, e);
23288    }
23289
23290    #[simd_test(enable = "avx512fp16,avx512vl")]
23291    unsafe fn test_mm256_scalef_ph() {
23292        let a = _mm256_set1_ph(1.);
23293        let b = _mm256_set1_ph(3.);
23294        let r = _mm256_scalef_ph(a, b);
23295        let e = _mm256_set1_ph(8.0);
23296        assert_eq_m256h(r, e);
23297    }
23298
23299    #[simd_test(enable = "avx512fp16,avx512vl")]
23300    unsafe fn test_mm256_mask_scalef_ph() {
23301        let a = _mm256_set1_ph(1.);
23302        let b = _mm256_set1_ph(3.);
23303        let src = _mm256_set1_ph(2.);
23304        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
23305        let e = _mm256_set_ph(
23306            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23307        );
23308        assert_eq_m256h(r, e);
23309    }
23310
23311    #[simd_test(enable = "avx512fp16,avx512vl")]
23312    unsafe fn test_mm256_maskz_scalef_ph() {
23313        let a = _mm256_set1_ph(1.);
23314        let b = _mm256_set1_ph(3.);
23315        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
23316        let e = _mm256_set_ph(
23317            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23318        );
23319        assert_eq_m256h(r, e);
23320    }
23321
23322    #[simd_test(enable = "avx512fp16")]
23323    unsafe fn test_mm512_scalef_ph() {
23324        let a = _mm512_set1_ph(1.);
23325        let b = _mm512_set1_ph(3.);
23326        let r = _mm512_scalef_ph(a, b);
23327        let e = _mm512_set1_ph(8.0);
23328        assert_eq_m512h(r, e);
23329    }
23330
23331    #[simd_test(enable = "avx512fp16")]
23332    unsafe fn test_mm512_mask_scalef_ph() {
23333        let a = _mm512_set1_ph(1.);
23334        let b = _mm512_set1_ph(3.);
23335        let src = _mm512_set1_ph(2.);
23336        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
23337        let e = _mm512_set_ph(
23338            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23339            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23340        );
23341        assert_eq_m512h(r, e);
23342    }
23343
23344    #[simd_test(enable = "avx512fp16")]
23345    unsafe fn test_mm512_maskz_scalef_ph() {
23346        let a = _mm512_set1_ph(1.);
23347        let b = _mm512_set1_ph(3.);
23348        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
23349        let e = _mm512_set_ph(
23350            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23351            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23352        );
23353        assert_eq_m512h(r, e);
23354    }
23355
23356    #[simd_test(enable = "avx512fp16")]
23357    unsafe fn test_mm512_scalef_round_ph() {
23358        let a = _mm512_set1_ph(1.);
23359        let b = _mm512_set1_ph(3.);
23360        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23361        let e = _mm512_set1_ph(8.0);
23362        assert_eq_m512h(r, e);
23363    }
23364
23365    #[simd_test(enable = "avx512fp16")]
23366    unsafe fn test_mm512_mask_scalef_round_ph() {
23367        let a = _mm512_set1_ph(1.);
23368        let b = _mm512_set1_ph(3.);
23369        let src = _mm512_set1_ph(2.);
23370        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23371            src,
23372            0b01010101010101010101010101010101,
23373            a,
23374            b,
23375        );
23376        let e = _mm512_set_ph(
23377            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23378            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23379        );
23380        assert_eq_m512h(r, e);
23381    }
23382
23383    #[simd_test(enable = "avx512fp16")]
23384    unsafe fn test_mm512_maskz_scalef_round_ph() {
23385        let a = _mm512_set1_ph(1.);
23386        let b = _mm512_set1_ph(3.);
23387        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23388            0b01010101010101010101010101010101,
23389            a,
23390            b,
23391        );
23392        let e = _mm512_set_ph(
23393            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23394            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23395        );
23396        assert_eq_m512h(r, e);
23397    }
23398
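    // The scalar `_sh` variants below operate on lane 0 only and copy lanes 1..7 from `a`
    // unchanged, which is why the expected vectors keep 10.0..16.0 in the upper lanes.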
23399    #[simd_test(enable = "avx512fp16")]
23400    unsafe fn test_mm_scalef_sh() {
23401        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23402        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23403        let r = _mm_scalef_sh(a, b);
23404        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23405        assert_eq_m128h(r, e);
23406    }
23407
23408    #[simd_test(enable = "avx512fp16")]
23409    unsafe fn test_mm_mask_scalef_sh() {
23410        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23411        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23412        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23413        let r = _mm_mask_scalef_sh(src, 0, a, b);
23414        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23415        assert_eq_m128h(r, e);
23416        let r = _mm_mask_scalef_sh(src, 1, a, b);
23417        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23418        assert_eq_m128h(r, e);
23419    }
23420
23421    #[simd_test(enable = "avx512fp16")]
23422    unsafe fn test_mm_maskz_scalef_sh() {
23423        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23424        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23425        let r = _mm_maskz_scalef_sh(0, a, b);
23426        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23427        assert_eq_m128h(r, e);
23428        let r = _mm_maskz_scalef_sh(1, a, b);
23429        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23430        assert_eq_m128h(r, e);
23431    }
23432
23433    #[simd_test(enable = "avx512fp16")]
23434    unsafe fn test_mm_scalef_round_sh() {
23435        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23436        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23437        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23438        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23439        assert_eq_m128h(r, e);
23440    }
23441
23442    #[simd_test(enable = "avx512fp16")]
23443    unsafe fn test_mm_mask_scalef_round_sh() {
23444        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23445        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23446        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23447        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23448            src, 0, a, b,
23449        );
23450        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23451        assert_eq_m128h(r, e);
23452        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23453            src, 1, a, b,
23454        );
23455        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23456        assert_eq_m128h(r, e);
23457    }
23458
23459    #[simd_test(enable = "avx512fp16")]
23460    unsafe fn test_mm_maskz_scalef_round_sh() {
23461        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23462        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23463        let r =
23464            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
23465        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23466        assert_eq_m128h(r, e);
23467        let r =
23468            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
23469        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23470        assert_eq_m128h(r, e);
23471    }
23472
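    // VREDUCEPH computes a - Round(a * 2^M) * 2^-M, where M is the upper nibble of the
    // immediate and its low bits select the rounding mode. With `16 | _MM_FROUND_TO_ZERO`
    // (M = 1, truncation) the part of 1.25 expressible in half-unit steps is removed,
    // leaving 0.25.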
23473    #[simd_test(enable = "avx512fp16,avx512vl")]
23474    unsafe fn test_mm_reduce_ph() {
23475        let a = _mm_set1_ph(1.25);
23476        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23477        let e = _mm_set1_ph(0.25);
23478        assert_eq_m128h(r, e);
23479    }
23480
23481    #[simd_test(enable = "avx512fp16,avx512vl")]
23482    unsafe fn test_mm_mask_reduce_ph() {
23483        let a = _mm_set1_ph(1.25);
23484        let src = _mm_set1_ph(2.0);
23485        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
23486        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
23487        assert_eq_m128h(r, e);
23488    }
23489
23490    #[simd_test(enable = "avx512fp16,avx512vl")]
23491    unsafe fn test_mm_maskz_reduce_ph() {
23492        let a = _mm_set1_ph(1.25);
23493        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
23494        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
23495        assert_eq_m128h(r, e);
23496    }
23497
23498    #[simd_test(enable = "avx512fp16,avx512vl")]
23499    unsafe fn test_mm256_reduce_ph() {
23500        let a = _mm256_set1_ph(1.25);
23501        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23502        let e = _mm256_set1_ph(0.25);
23503        assert_eq_m256h(r, e);
23504    }
23505
23506    #[simd_test(enable = "avx512fp16,avx512vl")]
23507    unsafe fn test_mm256_mask_reduce_ph() {
23508        let a = _mm256_set1_ph(1.25);
23509        let src = _mm256_set1_ph(2.0);
23510        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
23511        let e = _mm256_set_ph(
23512            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23513        );
23514        assert_eq_m256h(r, e);
23515    }
23516
23517    #[simd_test(enable = "avx512fp16,avx512vl")]
23518    unsafe fn test_mm256_maskz_reduce_ph() {
23519        let a = _mm256_set1_ph(1.25);
23520        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
23521        let e = _mm256_set_ph(
23522            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23523        );
23524        assert_eq_m256h(r, e);
23525    }
23526
23527    #[simd_test(enable = "avx512fp16")]
23528    unsafe fn test_mm512_reduce_ph() {
23529        let a = _mm512_set1_ph(1.25);
23530        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23531        let e = _mm512_set1_ph(0.25);
23532        assert_eq_m512h(r, e);
23533    }
23534
23535    #[simd_test(enable = "avx512fp16")]
23536    unsafe fn test_mm512_mask_reduce_ph() {
23537        let a = _mm512_set1_ph(1.25);
23538        let src = _mm512_set1_ph(2.0);
23539        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23540            src,
23541            0b01010101010101010101010101010101,
23542            a,
23543        );
23544        let e = _mm512_set_ph(
23545            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23546            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23547        );
23548        assert_eq_m512h(r, e);
23549    }
23550
23551    #[simd_test(enable = "avx512fp16")]
23552    unsafe fn test_mm512_maskz_reduce_ph() {
23553        let a = _mm512_set1_ph(1.25);
23554        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23555            0b01010101010101010101010101010101,
23556            a,
23557        );
23558        let e = _mm512_set_ph(
23559            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23560            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23561        );
23562        assert_eq_m512h(r, e);
23563    }
23564
23565    #[simd_test(enable = "avx512fp16")]
23566    unsafe fn test_mm512_reduce_round_ph() {
23567        let a = _mm512_set1_ph(1.25);
23568        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
23569        let e = _mm512_set1_ph(0.25);
23570        assert_eq_m512h(r, e);
23571    }
23572
23573    #[simd_test(enable = "avx512fp16")]
23574    unsafe fn test_mm512_mask_reduce_round_ph() {
23575        let a = _mm512_set1_ph(1.25);
23576        let src = _mm512_set1_ph(2.0);
23577        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23578            src,
23579            0b01010101010101010101010101010101,
23580            a,
23581        );
23582        let e = _mm512_set_ph(
23583            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23584            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23585        );
23586        assert_eq_m512h(r, e);
23587    }
23588
23589    #[simd_test(enable = "avx512fp16")]
23590    unsafe fn test_mm512_maskz_reduce_round_ph() {
23591        let a = _mm512_set1_ph(1.25);
23592        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23593            0b01010101010101010101010101010101,
23594            a,
23595        );
23596        let e = _mm512_set_ph(
23597            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23598            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23599        );
23600        assert_eq_m512h(r, e);
23601    }
23602
23603    #[simd_test(enable = "avx512fp16")]
23604    unsafe fn test_mm_reduce_sh() {
23605        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23606        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23607        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
23608        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23609        assert_eq_m128h(r, e);
23610    }
23611
23612    #[simd_test(enable = "avx512fp16")]
23613    unsafe fn test_mm_mask_reduce_sh() {
23614        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23615        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23616        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23617        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
23618        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23619        assert_eq_m128h(r, e);
23620        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
23621        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23622        assert_eq_m128h(r, e);
23623    }
23624
23625    #[simd_test(enable = "avx512fp16")]
23626    unsafe fn test_mm_maskz_reduce_sh() {
23627        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23628        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23629        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
23630        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23631        assert_eq_m128h(r, e);
23632        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
23633        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23634        assert_eq_m128h(r, e);
23635    }
23636
23637    #[simd_test(enable = "avx512fp16")]
23638    unsafe fn test_mm_reduce_round_sh() {
23639        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23640        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23641        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
23642        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23643        assert_eq_m128h(r, e);
23644    }
23645
23646    #[simd_test(enable = "avx512fp16")]
23647    unsafe fn test_mm_mask_reduce_round_sh() {
23648        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23649        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23650        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23651        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23652            src, 0, a, b,
23653        );
23654        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23655        assert_eq_m128h(r, e);
23656        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23657            src, 1, a, b,
23658        );
23659        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23660        assert_eq_m128h(r, e);
23661    }
23662
23663    #[simd_test(enable = "avx512fp16")]
23664    unsafe fn test_mm_maskz_reduce_round_sh() {
23665        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23666        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23667        let r =
23668            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
23669        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23670        assert_eq_m128h(r, e);
23671        let r =
23672            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
23673        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23674        assert_eq_m128h(r, e);
23675    }
23676
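    // The `reduce_{add,mul,max,min}_ph` helpers fold all lanes into a single f16: summing
    // eight 2.0 lanes gives 16.0, multiplying them gives 2^8 = 256.0, and so on for the
    // wider vector widths.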
23677    #[simd_test(enable = "avx512fp16,avx512vl")]
23678    unsafe fn test_mm_reduce_add_ph() {
23679        let a = _mm_set1_ph(2.0);
23680        let r = _mm_reduce_add_ph(a);
23681        assert_eq!(r, 16.0);
23682    }
23683
23684    #[simd_test(enable = "avx512fp16,avx512vl")]
23685    unsafe fn test_mm256_reduce_add_ph() {
23686        let a = _mm256_set1_ph(2.0);
23687        let r = _mm256_reduce_add_ph(a);
23688        assert_eq!(r, 32.0);
23689    }
23690
23691    #[simd_test(enable = "avx512fp16")]
23692    unsafe fn test_mm512_reduce_add_ph() {
23693        let a = _mm512_set1_ph(2.0);
23694        let r = _mm512_reduce_add_ph(a);
23695        assert_eq!(r, 64.0);
23696    }
23697
23698    #[simd_test(enable = "avx512fp16,avx512vl")]
23699    unsafe fn test_mm_reduce_mul_ph() {
23700        let a = _mm_set1_ph(2.0);
23701        let r = _mm_reduce_mul_ph(a);
23702        assert_eq!(r, 256.0);
23703    }
23704
23705    #[simd_test(enable = "avx512fp16,avx512vl")]
23706    unsafe fn test_mm256_reduce_mul_ph() {
23707        let a = _mm256_set1_ph(2.0);
23708        let r = _mm256_reduce_mul_ph(a);
23709        assert_eq!(r, 65536.0);
23710    }
23711
23712    #[simd_test(enable = "avx512fp16")]
23713    unsafe fn test_mm512_reduce_mul_ph() {
23714        let a = _mm512_set1_ph(2.0);
23715        let r = _mm512_reduce_mul_ph(a);
23716        assert_eq!(r, 16777216.0);
23717    }
23718
23719    #[simd_test(enable = "avx512fp16,avx512vl")]
23720    unsafe fn test_mm_reduce_max_ph() {
23721        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23722        let r = _mm_reduce_max_ph(a);
23723        assert_eq!(r, 8.0);
23724    }
23725
23726    #[simd_test(enable = "avx512fp16,avx512vl")]
23727    unsafe fn test_mm256_reduce_max_ph() {
23728        let a = _mm256_set_ph(
23729            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23730        );
23731        let r = _mm256_reduce_max_ph(a);
23732        assert_eq!(r, 16.0);
23733    }
23734
23735    #[simd_test(enable = "avx512fp16")]
23736    unsafe fn test_mm512_reduce_max_ph() {
23737        let a = _mm512_set_ph(
23738            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23739            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23740            31.0, 32.0,
23741        );
23742        let r = _mm512_reduce_max_ph(a);
23743        assert_eq!(r, 32.0);
23744    }
23745
23746    #[simd_test(enable = "avx512fp16,avx512vl")]
23747    unsafe fn test_mm_reduce_min_ph() {
23748        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23749        let r = _mm_reduce_min_ph(a);
23750        assert_eq!(r, 1.0);
23751    }
23752
23753    #[simd_test(enable = "avx512fp16,avx512vl")]
23754    unsafe fn test_mm256_reduce_min_ph() {
23755        let a = _mm256_set_ph(
23756            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23757        );
23758        let r = _mm256_reduce_min_ph(a);
23759        assert_eq!(r, 1.0);
23760    }
23761
23762    #[simd_test(enable = "avx512fp16")]
23763    unsafe fn test_mm512_reduce_min_ph() {
23764        let a = _mm512_set_ph(
23765            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23766            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23767            31.0, 32.0,
23768        );
23769        let r = _mm512_reduce_min_ph(a);
23770        assert_eq!(r, 1.0);
23771    }
23772
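    // VFPCLASSPH tests each element against the categories selected by the immediate;
    // 0x18 sets bits 3 and 4 (positive and negative infinity), so only the INFINITY and
    // NEG_INFINITY lanes appear in the result mask.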
23773    #[simd_test(enable = "avx512fp16,avx512vl")]
23774    unsafe fn test_mm_fpclass_ph_mask() {
23775        let a = _mm_set_ph(
23776            1.,
23777            f16::INFINITY,
23778            f16::NEG_INFINITY,
23779            0.0,
23780            -0.0,
23781            -2.0,
23782            f16::NAN,
23783            5.9e-8, // Denormal
23784        );
23785        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
23786        assert_eq!(r, 0b01100000);
23787    }
23788
23789    #[simd_test(enable = "avx512fp16,avx512vl")]
23790    unsafe fn test_mm_mask_fpclass_ph_mask() {
23791        let a = _mm_set_ph(
23792            1.,
23793            f16::INFINITY,
23794            f16::NEG_INFINITY,
23795            0.0,
23796            -0.0,
23797            -2.0,
23798            f16::NAN,
23799            5.9e-8, // Denormal
23800        );
23801        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
23802        assert_eq!(r, 0b01000000);
23803    }
23804
23805    #[simd_test(enable = "avx512fp16,avx512vl")]
23806    unsafe fn test_mm256_fpclass_ph_mask() {
23807        let a = _mm256_set_ph(
23808            1.,
23809            f16::INFINITY,
23810            f16::NEG_INFINITY,
23811            0.0,
23812            -0.0,
23813            -2.0,
23814            f16::NAN,
23815            5.9e-8, // Denormal
23816            1.,
23817            f16::INFINITY,
23818            f16::NEG_INFINITY,
23819            0.0,
23820            -0.0,
23821            -2.0,
23822            f16::NAN,
23823            5.9e-8, // Denormal
23824        );
23825        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
23826        assert_eq!(r, 0b0110000001100000);
23827    }
23828
23829    #[simd_test(enable = "avx512fp16,avx512vl")]
23830    unsafe fn test_mm256_mask_fpclass_ph_mask() {
23831        let a = _mm256_set_ph(
23832            1.,
23833            f16::INFINITY,
23834            f16::NEG_INFINITY,
23835            0.0,
23836            -0.0,
23837            -2.0,
23838            f16::NAN,
23839            5.9e-8, // Denormal
23840            1.,
23841            f16::INFINITY,
23842            f16::NEG_INFINITY,
23843            0.0,
23844            -0.0,
23845            -2.0,
23846            f16::NAN,
23847            5.9e-8, // Denormal
23848        );
23849        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
23850        assert_eq!(r, 0b0100000001000000);
23851    }
23852
23853    #[simd_test(enable = "avx512fp16")]
23854    unsafe fn test_mm512_fpclass_ph_mask() {
23855        let a = _mm512_set_ph(
23856            1.,
23857            f16::INFINITY,
23858            f16::NEG_INFINITY,
23859            0.0,
23860            -0.0,
23861            -2.0,
23862            f16::NAN,
23863            5.9e-8, // Denormal
23864            1.,
23865            f16::INFINITY,
23866            f16::NEG_INFINITY,
23867            0.0,
23868            -0.0,
23869            -2.0,
23870            f16::NAN,
23871            5.9e-8, // Denormal
23872            1.,
23873            f16::INFINITY,
23874            f16::NEG_INFINITY,
23875            0.0,
23876            -0.0,
23877            -2.0,
23878            f16::NAN,
23879            5.9e-8, // Denormal
23880            1.,
23881            f16::INFINITY,
23882            f16::NEG_INFINITY,
23883            0.0,
23884            -0.0,
23885            -2.0,
23886            f16::NAN,
23887            5.9e-8, // Denormal
23888        );
23889        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
23890        assert_eq!(r, 0b01100000011000000110000001100000);
23891    }
23892
23893    #[simd_test(enable = "avx512fp16")]
23894    unsafe fn test_mm512_mask_fpclass_ph_mask() {
23895        let a = _mm512_set_ph(
23896            1.,
23897            f16::INFINITY,
23898            f16::NEG_INFINITY,
23899            0.0,
23900            -0.0,
23901            -2.0,
23902            f16::NAN,
23903            5.9e-8, // Denormal
23904            1.,
23905            f16::INFINITY,
23906            f16::NEG_INFINITY,
23907            0.0,
23908            -0.0,
23909            -2.0,
23910            f16::NAN,
23911            5.9e-8, // Denormal
23912            1.,
23913            f16::INFINITY,
23914            f16::NEG_INFINITY,
23915            0.0,
23916            -0.0,
23917            -2.0,
23918            f16::NAN,
23919            5.9e-8, // Denormal
23920            1.,
23921            f16::INFINITY,
23922            f16::NEG_INFINITY,
23923            0.0,
23924            -0.0,
23925            -2.0,
23926            f16::NAN,
23927            5.9e-8, // Denormal
23928        );
23929        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
23930        assert_eq!(r, 0b01000000010000000100000001000000);
23931    }
23932
23933    #[simd_test(enable = "avx512fp16")]
23934    unsafe fn test_mm_fpclass_sh_mask() {
23935        let a = _mm_set_sh(f16::INFINITY);
23936        let r = _mm_fpclass_sh_mask::<0x18>(a);
23937        assert_eq!(r, 1);
23938    }
23939
23940    #[simd_test(enable = "avx512fp16")]
23941    unsafe fn test_mm_mask_fpclass_sh_mask() {
23942        let a = _mm_set_sh(f16::INFINITY);
23943        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
23944        assert_eq!(r, 0);
23945        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
23946        assert_eq!(r, 1);
23947    }
23948
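    // `mask_blend` picks the element from `b` where the mask bit is set and from `a`
    // otherwise; with the alternating mask the result interleaves the two inputs.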
23949    #[simd_test(enable = "avx512fp16,avx512vl")]
23950    unsafe fn test_mm_mask_blend_ph() {
23951        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23952        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
23953        let r = _mm_mask_blend_ph(0b01010101, a, b);
23954        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
23955        assert_eq_m128h(r, e);
23956    }
23957
23958    #[simd_test(enable = "avx512fp16,avx512vl")]
23959    unsafe fn test_mm256_mask_blend_ph() {
23960        let a = _mm256_set_ph(
23961            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23962        );
23963        let b = _mm256_set_ph(
23964            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
23965            -14.0, -15.0, -16.0,
23966        );
23967        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
23968        let e = _mm256_set_ph(
23969            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
23970            -16.0,
23971        );
23972        assert_eq_m256h(r, e);
23973    }
23974
23975    #[simd_test(enable = "avx512fp16")]
23976    unsafe fn test_mm512_mask_blend_ph() {
23977        let a = _mm512_set_ph(
23978            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23979            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23980            31.0, 32.0,
23981        );
23982        let b = _mm512_set_ph(
23983            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
23984            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
23985            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
23986        );
23987        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
23988        let e = _mm512_set_ph(
23989            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
23990            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
23991            29.0, -30.0, 31.0, -32.0,
23992        );
23993        assert_eq_m512h(r, e);
23994    }
23995
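    // `permutex2var` indexes into the concatenation of `a` (indices 0..N) and `b`
    // (indices N..2N); the even indices used here gather every other element of the pair.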
23996    #[simd_test(enable = "avx512fp16,avx512vl")]
23997    unsafe fn test_mm_permutex2var_ph() {
23998        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23999        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
24000        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
24001        let r = _mm_permutex2var_ph(a, idx, b);
24002        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
24003        assert_eq_m128h(r, e);
24004    }
24005
24006    #[simd_test(enable = "avx512fp16,avx512vl")]
24007    unsafe fn test_mm256_permutex2var_ph() {
24008        let a = _mm256_setr_ph(
24009            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24010        );
24011        let b = _mm256_setr_ph(
24012            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24013            31.0, 32.0,
24014        );
24015        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
24016        let r = _mm256_permutex2var_ph(a, idx, b);
24017        let e = _mm256_setr_ph(
24018            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24019            31.0,
24020        );
24021        assert_eq_m256h(r, e);
24022    }
24023
24024    #[simd_test(enable = "avx512fp16")]
24025    unsafe fn test_mm512_permutex2var_ph() {
24026        let a = _mm512_setr_ph(
24027            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24028            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24029            31.0, 32.0,
24030        );
24031        let b = _mm512_setr_ph(
24032            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
24033            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
24034            61.0, 62.0, 63.0, 64.0,
24035        );
24036        let idx = _mm512_set_epi16(
24037            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
24038            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
24039        );
24040        let r = _mm512_permutex2var_ph(a, idx, b);
24041        let e = _mm512_setr_ph(
24042            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24043            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
24044            59.0, 61.0, 63.0,
24045        );
24046        assert_eq_m512h(r, e);
24047    }
24048
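    // `permutexvar` gathers elements of `a` at the positions given in `idx`, i.e.
    // r[i] = a[idx[i]] for each lane.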
24049    #[simd_test(enable = "avx512fp16,avx512vl")]
24050    unsafe fn test_mm_permutexvar_ph() {
24051        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24052        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
24053        let r = _mm_permutexvar_ph(idx, a);
24054        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
24055        assert_eq_m128h(r, e);
24056    }
24057
24058    #[simd_test(enable = "avx512fp16,avx512vl")]
24059    unsafe fn test_mm256_permutexvar_ph() {
24060        let a = _mm256_set_ph(
24061            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24062        );
24063        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
24064        let r = _mm256_permutexvar_ph(idx, a);
24065        let e = _mm256_setr_ph(
24066            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
24067        );
24068        assert_eq_m256h(r, e);
24069    }
24070
24071    #[simd_test(enable = "avx512fp16")]
24072    unsafe fn test_mm512_permutexvar_ph() {
24073        let a = _mm512_set_ph(
24074            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24075            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24076            31.0, 32.0,
24077        );
24078        let idx = _mm512_set_epi16(
24079            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
24080            17, 19, 21, 23, 25, 27, 29, 31,
24081        );
24082        let r = _mm512_permutexvar_ph(idx, a);
24083        let e = _mm512_setr_ph(
24084            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24085            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
24086            30.0, 32.0,
24087        );
24088        assert_eq_m512h(r, e);
24089    }
24090
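    // The following tests convert packed signed 16-bit integers to half precision, one
    // output lane per input lane; the masked forms write the converted value only where
    // the mask bit is set.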
24091    #[simd_test(enable = "avx512fp16,avx512vl")]
24092    unsafe fn test_mm_cvtepi16_ph() {
24093        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24094        let r = _mm_cvtepi16_ph(a);
24095        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24096        assert_eq_m128h(r, e);
24097    }
24098
24099    #[simd_test(enable = "avx512fp16,avx512vl")]
24100    unsafe fn test_mm_mask_cvtepi16_ph() {
24101        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24102        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24103        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
24104        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24105        assert_eq_m128h(r, e);
24106    }
24107
24108    #[simd_test(enable = "avx512fp16,avx512vl")]
24109    unsafe fn test_mm_maskz_cvtepi16_ph() {
24110        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24111        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
24112        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24113        assert_eq_m128h(r, e);
24114    }
24115
24116    #[simd_test(enable = "avx512fp16,avx512vl")]
24117    unsafe fn test_mm256_cvtepi16_ph() {
24118        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24119        let r = _mm256_cvtepi16_ph(a);
24120        let e = _mm256_set_ph(
24121            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24122        );
24123        assert_eq_m256h(r, e);
24124    }
24125
24126    #[simd_test(enable = "avx512fp16,avx512vl")]
24127    unsafe fn test_mm256_mask_cvtepi16_ph() {
24128        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24129        let src = _mm256_set_ph(
24130            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24131        );
24132        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
24133        let e = _mm256_set_ph(
24134            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24135        );
24136        assert_eq_m256h(r, e);
24137    }
24138
24139    #[simd_test(enable = "avx512fp16,avx512vl")]
24140    unsafe fn test_mm256_maskz_cvtepi16_ph() {
24141        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24142        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
24143        let e = _mm256_set_ph(
24144            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24145        );
24146        assert_eq_m256h(r, e);
24147    }
24148
24149    #[simd_test(enable = "avx512fp16")]
24150    unsafe fn test_mm512_cvtepi16_ph() {
24151        let a = _mm512_set_epi16(
24152            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24153            25, 26, 27, 28, 29, 30, 31, 32,
24154        );
24155        let r = _mm512_cvtepi16_ph(a);
24156        let e = _mm512_set_ph(
24157            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24158            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24159            31.0, 32.0,
24160        );
24161        assert_eq_m512h(r, e);
24162    }
24163
24164    #[simd_test(enable = "avx512fp16")]
24165    unsafe fn test_mm512_mask_cvtepi16_ph() {
24166        let a = _mm512_set_epi16(
24167            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24168            25, 26, 27, 28, 29, 30, 31, 32,
24169        );
24170        let src = _mm512_set_ph(
24171            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24172            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24173        );
24174        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
24175        let e = _mm512_set_ph(
24176            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24177            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24178        );
24179        assert_eq_m512h(r, e);
24180    }
24181
24182    #[simd_test(enable = "avx512fp16")]
24183    unsafe fn test_mm512_maskz_cvtepi16_ph() {
24184        let a = _mm512_set_epi16(
24185            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24186            25, 26, 27, 28, 29, 30, 31, 32,
24187        );
24188        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
24189        let e = _mm512_set_ph(
24190            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24191            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24192        );
24193        assert_eq_m512h(r, e);
24194    }
24195
24196    #[simd_test(enable = "avx512fp16")]
24197    unsafe fn test_mm512_cvt_roundepi16_ph() {
24198        let a = _mm512_set_epi16(
24199            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24200            25, 26, 27, 28, 29, 30, 31, 32,
24201        );
24202        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24203        let e = _mm512_set_ph(
24204            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24205            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24206            31.0, 32.0,
24207        );
24208        assert_eq_m512h(r, e);
24209    }
24210
24211    #[simd_test(enable = "avx512fp16")]
24212    unsafe fn test_mm512_mask_cvt_roundepi16_ph() {
24213        let a = _mm512_set_epi16(
24214            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24215            25, 26, 27, 28, 29, 30, 31, 32,
24216        );
24217        let src = _mm512_set_ph(
24218            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24219            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24220        );
24221        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24222            src,
24223            0b01010101010101010101010101010101,
24224            a,
24225        );
24226        let e = _mm512_set_ph(
24227            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24228            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24229        );
24230        assert_eq_m512h(r, e);
24231    }
24232
24233    #[simd_test(enable = "avx512fp16")]
24234    unsafe fn test_mm512_maskz_cvt_roundepi16_ph() {
24235        let a = _mm512_set_epi16(
24236            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24237            25, 26, 27, 28, 29, 30, 31, 32,
24238        );
24239        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24240            0b01010101010101010101010101010101,
24241            a,
24242        );
24243        let e = _mm512_set_ph(
24244            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24245            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24246        );
24247        assert_eq_m512h(r, e);
24248    }
24249
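    // The `cvtepu16_ph` tests mirror the signed ones above but exercise the unsigned
    // 16-bit conversion.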
24250    #[simd_test(enable = "avx512fp16,avx512vl")]
24251    unsafe fn test_mm_cvtepu16_ph() {
24252        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24253        let r = _mm_cvtepu16_ph(a);
24254        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24255        assert_eq_m128h(r, e);
24256    }
24257
24258    #[simd_test(enable = "avx512fp16,avx512vl")]
24259    unsafe fn test_mm_mask_cvtepu16_ph() {
24260        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24261        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24262        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
24263        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24264        assert_eq_m128h(r, e);
24265    }
24266
24267    #[simd_test(enable = "avx512fp16,avx512vl")]
24268    unsafe fn test_mm_maskz_cvtepu16_ph() {
24269        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24270        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
24271        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24272        assert_eq_m128h(r, e);
24273    }
24274
24275    #[simd_test(enable = "avx512fp16,avx512vl")]
24276    unsafe fn test_mm256_cvtepu16_ph() {
24277        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24278        let r = _mm256_cvtepu16_ph(a);
24279        let e = _mm256_set_ph(
24280            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24281        );
24282        assert_eq_m256h(r, e);
24283    }
24284
24285    #[simd_test(enable = "avx512fp16,avx512vl")]
24286    unsafe fn test_mm256_mask_cvtepu16_ph() {
24287        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24288        let src = _mm256_set_ph(
24289            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24290        );
24291        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
24292        let e = _mm256_set_ph(
24293            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24294        );
24295        assert_eq_m256h(r, e);
24296    }
24297
24298    #[simd_test(enable = "avx512fp16,avx512vl")]
24299    unsafe fn test_mm256_maskz_cvtepu16_ph() {
24300        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24301        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
24302        let e = _mm256_set_ph(
24303            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24304        );
24305        assert_eq_m256h(r, e);
24306    }
24307
24308    #[simd_test(enable = "avx512fp16")]
24309    unsafe fn test_mm512_cvtepu16_ph() {
24310        let a = _mm512_set_epi16(
24311            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24312            25, 26, 27, 28, 29, 30, 31, 32,
24313        );
24314        let r = _mm512_cvtepu16_ph(a);
24315        let e = _mm512_set_ph(
24316            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24317            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24318            31.0, 32.0,
24319        );
24320        assert_eq_m512h(r, e);
24321    }
24322
24323    #[simd_test(enable = "avx512fp16")]
24324    unsafe fn test_mm512_mask_cvtepu16_ph() {
24325        let a = _mm512_set_epi16(
24326            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24327            25, 26, 27, 28, 29, 30, 31, 32,
24328        );
24329        let src = _mm512_set_ph(
24330            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24331            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24332        );
24333        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
24334        let e = _mm512_set_ph(
24335            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24336            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24337        );
24338        assert_eq_m512h(r, e);
24339    }
24340
24341    #[simd_test(enable = "avx512fp16")]
24342    unsafe fn test_mm512_maskz_cvtepu16_ph() {
24343        let a = _mm512_set_epi16(
24344            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24345            25, 26, 27, 28, 29, 30, 31, 32,
24346        );
24347        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
24348        let e = _mm512_set_ph(
24349            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24350            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24351        );
24352        assert_eq_m512h(r, e);
24353    }
24354
24355    #[simd_test(enable = "avx512fp16")]
24356    unsafe fn test_mm512_cvt_roundepu16_ph() {
24357        let a = _mm512_set_epi16(
24358            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24359            25, 26, 27, 28, 29, 30, 31, 32,
24360        );
24361        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24362        let e = _mm512_set_ph(
24363            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24364            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24365            31.0, 32.0,
24366        );
24367        assert_eq_m512h(r, e);
24368    }
24369
24370    #[simd_test(enable = "avx512fp16")]
24371    unsafe fn test_mm512_mask_cvt_roundepu16_ph() {
24372        let a = _mm512_set_epi16(
24373            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24374            25, 26, 27, 28, 29, 30, 31, 32,
24375        );
24376        let src = _mm512_set_ph(
24377            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24378            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24379        );
24380        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24381            src,
24382            0b01010101010101010101010101010101,
24383            a,
24384        );
24385        let e = _mm512_set_ph(
24386            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24387            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24388        );
24389        assert_eq_m512h(r, e);
24390    }
24391
24392    #[simd_test(enable = "avx512fp16")]
24393    unsafe fn test_mm512_maskz_cvt_roundepu16_ph() {
24394        let a = _mm512_set_epi16(
24395            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24396            25, 26, 27, 28, 29, 30, 31, 32,
24397        );
24398        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24399            0b01010101010101010101010101010101,
24400            a,
24401        );
24402        let e = _mm512_set_ph(
24403            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24404            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24405        );
24406        assert_eq_m512h(r, e);
24407    }
24408
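    // Converting 32-bit integers halves the element count: a __m128i yields four halves in
    // the low 64 bits of the __m128h result (upper lanes zeroed), a __m256i fills a
    // __m128h, and a __m512i fills a __m256h.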
24409    #[simd_test(enable = "avx512fp16,avx512vl")]
24410    unsafe fn test_mm_cvtepi32_ph() {
24411        let a = _mm_set_epi32(1, 2, 3, 4);
24412        let r = _mm_cvtepi32_ph(a);
24413        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24414        assert_eq_m128h(r, e);
24415    }
24416
24417    #[simd_test(enable = "avx512fp16,avx512vl")]
24418    unsafe fn test_mm_mask_cvtepi32_ph() {
24419        let a = _mm_set_epi32(1, 2, 3, 4);
24420        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24421        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
24422        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24423        assert_eq_m128h(r, e);
24424    }
24425
24426    #[simd_test(enable = "avx512fp16,avx512vl")]
24427    unsafe fn test_mm_maskz_cvtepi32_ph() {
24428        let a = _mm_set_epi32(1, 2, 3, 4);
24429        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
24430        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24431        assert_eq_m128h(r, e);
24432    }
24433
24434    #[simd_test(enable = "avx512fp16,avx512vl")]
24435    unsafe fn test_mm256_cvtepi32_ph() {
24436        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24437        let r = _mm256_cvtepi32_ph(a);
24438        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24439        assert_eq_m128h(r, e);
24440    }
24441
24442    #[simd_test(enable = "avx512fp16,avx512vl")]
24443    unsafe fn test_mm256_mask_cvtepi32_ph() {
24444        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24445        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24446        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
24447        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24448        assert_eq_m128h(r, e);
24449    }
24450
24451    #[simd_test(enable = "avx512fp16,avx512vl")]
24452    unsafe fn test_mm256_maskz_cvtepi32_ph() {
24453        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24454        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
24455        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24456        assert_eq_m128h(r, e);
24457    }
24458
24459    #[simd_test(enable = "avx512fp16")]
24460    unsafe fn test_mm512_cvtepi32_ph() {
24461        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24462        let r = _mm512_cvtepi32_ph(a);
24463        let e = _mm256_set_ph(
24464            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24465        );
24466        assert_eq_m256h(r, e);
24467    }
24468
24469    #[simd_test(enable = "avx512fp16")]
24470    unsafe fn test_mm512_mask_cvtepi32_ph() {
24471        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24472        let src = _mm256_set_ph(
24473            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24474        );
24475        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
24476        let e = _mm256_set_ph(
24477            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24478        );
24479        assert_eq_m256h(r, e);
24480    }
24481
24482    #[simd_test(enable = "avx512fp16")]
24483    unsafe fn test_mm512_maskz_cvtepi32_ph() {
24484        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24485        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
24486        let e = _mm256_set_ph(
24487            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24488        );
24489        assert_eq_m256h(r, e);
24490    }
24491
24492    #[simd_test(enable = "avx512fp16")]
24493    unsafe fn test_mm512_cvt_roundepi32_ph() {
24494        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24495        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24496        let e = _mm256_set_ph(
24497            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24498        );
24499        assert_eq_m256h(r, e);
24500    }
24501
24502    #[simd_test(enable = "avx512fp16")]
24503    unsafe fn test_mm512_mask_cvt_roundepi32_ph() {
24504        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24505        let src = _mm256_set_ph(
24506            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24507        );
24508        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24509            src,
24510            0b0101010101010101,
24511            a,
24512        );
24513        let e = _mm256_set_ph(
24514            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24515        );
24516        assert_eq_m256h(r, e);
24517    }
24518
24519    #[simd_test(enable = "avx512fp16")]
24520    unsafe fn test_mm512_maskz_cvt_roundepi32_ph() {
24521        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24522        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24523            0b0101010101010101,
24524            a,
24525        );
24526        let e = _mm256_set_ph(
24527            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24528        );
24529        assert_eq_m256h(r, e);
24530    }
24531
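    // `_mm_cvti32_sh` converts the scalar i32 to half precision in lane 0 and copies
    // lanes 1..7 from `a`.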
24532    #[simd_test(enable = "avx512fp16")]
24533    unsafe fn test_mm_cvti32_sh() {
24534        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24535        let r = _mm_cvti32_sh(a, 10);
24536        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24537        assert_eq_m128h(r, e);
24538    }
24539
24540    #[simd_test(enable = "avx512fp16")]
24541    unsafe fn test_mm_cvt_roundi32_sh() {
24542        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24543        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24544        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24545        assert_eq_m128h(r, e);
24546    }
24547
24548    #[simd_test(enable = "avx512fp16,avx512vl")]
24549    unsafe fn test_mm_cvtepu32_ph() {
24550        let a = _mm_set_epi32(1, 2, 3, 4);
24551        let r = _mm_cvtepu32_ph(a);
24552        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24553        assert_eq_m128h(r, e);
24554    }
24555
24556    #[simd_test(enable = "avx512fp16,avx512vl")]
24557    unsafe fn test_mm_mask_cvtepu32_ph() {
24558        let a = _mm_set_epi32(1, 2, 3, 4);
24559        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24560        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
24561        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24562        assert_eq_m128h(r, e);
24563    }
24564
24565    #[simd_test(enable = "avx512fp16,avx512vl")]
24566    unsafe fn test_mm_maskz_cvtepu32_ph() {
24567        let a = _mm_set_epi32(1, 2, 3, 4);
24568        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
24569        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24570        assert_eq_m128h(r, e);
24571    }
24572
24573    #[simd_test(enable = "avx512fp16,avx512vl")]
24574    unsafe fn test_mm256_cvtepu32_ph() {
24575        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24576        let r = _mm256_cvtepu32_ph(a);
24577        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24578        assert_eq_m128h(r, e);
24579    }
24580
24581    #[simd_test(enable = "avx512fp16,avx512vl")]
24582    unsafe fn test_mm256_mask_cvtepu32_ph() {
24583        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24584        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24585        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
24586        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24587        assert_eq_m128h(r, e);
24588    }
24589
24590    #[simd_test(enable = "avx512fp16,avx512vl")]
24591    unsafe fn test_mm256_maskz_cvtepu32_ph() {
24592        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24593        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
24594        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24595        assert_eq_m128h(r, e);
24596    }
24597
24598    #[simd_test(enable = "avx512fp16")]
24599    unsafe fn test_mm512_cvtepu32_ph() {
24600        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24601        let r = _mm512_cvtepu32_ph(a);
24602        let e = _mm256_set_ph(
24603            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24604        );
24605        assert_eq_m256h(r, e);
24606    }
24607
24608    #[simd_test(enable = "avx512fp16")]
24609    unsafe fn test_mm512_mask_cvtepu32_ph() {
24610        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24611        let src = _mm256_set_ph(
24612            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24613        );
24614        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
24615        let e = _mm256_set_ph(
24616            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24617        );
24618        assert_eq_m256h(r, e);
24619    }
24620
24621    #[simd_test(enable = "avx512fp16")]
24622    unsafe fn test_mm512_maskz_cvtepu32_ph() {
24623        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24624        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
24625        let e = _mm256_set_ph(
24626            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24627        );
24628        assert_eq_m256h(r, e);
24629    }
24630
24631    #[simd_test(enable = "avx512fp16")]
24632    unsafe fn test_mm512_cvt_roundepu32_ph() {
24633        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24634        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24635        let e = _mm256_set_ph(
24636            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24637        );
24638        assert_eq_m256h(r, e);
24639    }
24640
24641    #[simd_test(enable = "avx512fp16")]
24642    unsafe fn test_mm512_mask_cvt_roundepu32_ph() {
24643        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24644        let src = _mm256_set_ph(
24645            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24646        );
24647        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24648            src,
24649            0b0101010101010101,
24650            a,
24651        );
24652        let e = _mm256_set_ph(
24653            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
24654            16.0,
24655        );
24656        assert_eq_m256h(r, e);
24657    }
24658
24659    #[simd_test(enable = "avx512fp16")]
24660    unsafe fn test_mm512_maskz_cvt_roundepu32_ph() {
24661        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24662        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24663            0b0101010101010101,
24664            a,
24665        );
24666        let e = _mm256_set_ph(
24667            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24668        );
24669        assert_eq_m256h(r, e);
24670    }
24671
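    // Scalar conversions such as _mm_cvtu32_sh replace only element 0 of the __m128h operand;
    // the remaining seven elements are copied through unchanged.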
24672    #[simd_test(enable = "avx512fp16")]
24673    unsafe fn test_mm_cvtu32_sh() {
24674        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24675        let r = _mm_cvtu32_sh(a, 10);
24676        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24677        assert_eq_m128h(r, e);
24678    }
24679
24680    #[simd_test(enable = "avx512fp16")]
24681    unsafe fn test_mm_cvt_roundu32_sh() {
24682        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24683        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24684        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24685        assert_eq_m128h(r, e);
24686    }
24687
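    // Converting packed 64-bit integers to f16 narrows the vector: only the low 2 (from __m128i),
    // 4 (from __m256i) or 8 (from __m512i) elements of the __m128h result are written and the
    // remaining elements are zeroed.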
24688    #[simd_test(enable = "avx512fp16,avx512vl")]
24689    unsafe fn test_mm_cvtepi64_ph() {
24690        let a = _mm_set_epi64x(1, 2);
24691        let r = _mm_cvtepi64_ph(a);
24692        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24693        assert_eq_m128h(r, e);
24694    }
24695
24696    #[simd_test(enable = "avx512fp16,avx512vl")]
24697    unsafe fn test_mm_mask_cvtepi64_ph() {
24698        let a = _mm_set_epi64x(1, 2);
24699        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24700        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
24701        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24702        assert_eq_m128h(r, e);
24703    }
24704
24705    #[simd_test(enable = "avx512fp16,avx512vl")]
24706    unsafe fn test_mm_maskz_cvtepi64_ph() {
24707        let a = _mm_set_epi64x(1, 2);
24708        let r = _mm_maskz_cvtepi64_ph(0b01, a);
24709        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
24710        assert_eq_m128h(r, e);
24711    }
24712
24713    #[simd_test(enable = "avx512fp16,avx512vl")]
24714    unsafe fn test_mm256_cvtepi64_ph() {
24715        let a = _mm256_set_epi64x(1, 2, 3, 4);
24716        let r = _mm256_cvtepi64_ph(a);
24717        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24718        assert_eq_m128h(r, e);
24719    }
24720
24721    #[simd_test(enable = "avx512fp16,avx512vl")]
24722    unsafe fn test_mm256_mask_cvtepi64_ph() {
24723        let a = _mm256_set_epi64x(1, 2, 3, 4);
24724        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24725        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
24726        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24727        assert_eq_m128h(r, e);
24728    }
24729
24730    #[simd_test(enable = "avx512fp16,avx512vl")]
24731    unsafe fn test_mm256_maskz_cvtepi64_ph() {
24732        let a = _mm256_set_epi64x(1, 2, 3, 4);
24733        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
24734        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24735        assert_eq_m128h(r, e);
24736    }
24737
24738    #[simd_test(enable = "avx512fp16")]
24739    unsafe fn test_mm512_cvtepi64_ph() {
24740        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24741        let r = _mm512_cvtepi64_ph(a);
24742        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24743        assert_eq_m128h(r, e);
24744    }
24745
24746    #[simd_test(enable = "avx512fp16")]
24747    unsafe fn test_mm512_mask_cvtepi64_ph() {
24748        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24749        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24750        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
24751        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24752        assert_eq_m128h(r, e);
24753    }
24754
24755    #[simd_test(enable = "avx512fp16")]
24756    unsafe fn test_mm512_maskz_cvtepi64_ph() {
24757        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24758        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
24759        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24760        assert_eq_m128h(r, e);
24761    }
24762
24763    #[simd_test(enable = "avx512fp16")]
24764    unsafe fn test_mm512_cvt_roundepi64_ph() {
24765        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24766        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24767        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24768        assert_eq_m128h(r, e);
24769    }
24770
24771    #[simd_test(enable = "avx512fp16")]
24772    unsafe fn test_mm512_mask_cvt_roundepi64_ph() {
24773        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24774        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24775        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24776            src, 0b01010101, a,
24777        );
24778        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24779        assert_eq_m128h(r, e);
24780    }
24781
24782    #[simd_test(enable = "avx512fp16")]
24783    unsafe fn test_mm512_maskz_cvt_roundepi64_ph() {
24784        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24785        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24786            0b01010101, a,
24787        );
24788        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24789        assert_eq_m128h(r, e);
24790    }
24791
24792    #[simd_test(enable = "avx512fp16,avx512vl")]
24793    unsafe fn test_mm_cvtepu64_ph() {
24794        let a = _mm_set_epi64x(1, 2);
24795        let r = _mm_cvtepu64_ph(a);
24796        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24797        assert_eq_m128h(r, e);
24798    }
24799
24800    #[simd_test(enable = "avx512fp16,avx512vl")]
24801    unsafe fn test_mm_mask_cvtepu64_ph() {
24802        let a = _mm_set_epi64x(1, 2);
24803        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24804        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
24805        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24806        assert_eq_m128h(r, e);
24807    }
24808
24809    #[simd_test(enable = "avx512fp16,avx512vl")]
24810    unsafe fn test_mm_maskz_cvtepu64_ph() {
24811        let a = _mm_set_epi64x(1, 2);
24812        let r = _mm_maskz_cvtepu64_ph(0b01, a);
24813        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
24814        assert_eq_m128h(r, e);
24815    }
24816
24817    #[simd_test(enable = "avx512fp16,avx512vl")]
24818    unsafe fn test_mm256_cvtepu64_ph() {
24819        let a = _mm256_set_epi64x(1, 2, 3, 4);
24820        let r = _mm256_cvtepu64_ph(a);
24821        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24822        assert_eq_m128h(r, e);
24823    }
24824
24825    #[simd_test(enable = "avx512fp16,avx512vl")]
24826    unsafe fn test_mm256_mask_cvtepu64_ph() {
24827        let a = _mm256_set_epi64x(1, 2, 3, 4);
24828        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24829        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
24830        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24831        assert_eq_m128h(r, e);
24832    }
24833
24834    #[simd_test(enable = "avx512fp16,avx512vl")]
24835    unsafe fn test_mm256_maskz_cvtepu64_ph() {
24836        let a = _mm256_set_epi64x(1, 2, 3, 4);
24837        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
24838        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24839        assert_eq_m128h(r, e);
24840    }
24841
24842    #[simd_test(enable = "avx512fp16")]
24843    unsafe fn test_mm512_cvtepu64_ph() {
24844        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24845        let r = _mm512_cvtepu64_ph(a);
24846        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24847        assert_eq_m128h(r, e);
24848    }
24849
24850    #[simd_test(enable = "avx512fp16")]
24851    unsafe fn test_mm512_mask_cvtepu64_ph() {
24852        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24853        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24854        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
24855        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24856        assert_eq_m128h(r, e);
24857    }
24858
24859    #[simd_test(enable = "avx512fp16")]
24860    unsafe fn test_mm512_maskz_cvtepu64_ph() {
24861        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24862        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
24863        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24864        assert_eq_m128h(r, e);
24865    }
24866
24867    #[simd_test(enable = "avx512fp16")]
24868    unsafe fn test_mm512_cvt_roundepu64_ph() {
24869        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24870        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24871        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24872        assert_eq_m128h(r, e);
24873    }
24874
24875    #[simd_test(enable = "avx512fp16")]
24876    unsafe fn test_mm512_mask_cvt_roundepu64_ph() {
24877        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24878        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24879        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24880            src, 0b01010101, a,
24881        );
24882        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24883        assert_eq_m128h(r, e);
24884    }
24885
24886    #[simd_test(enable = "avx512fp16")]
24887    unsafe fn test_mm512_maskz_cvt_roundepu64_ph() {
24888        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24889        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24890            0b01010101, a,
24891        );
24892        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24893        assert_eq_m128h(r, e);
24894    }
24895
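    // The _mm*_cvtxps_ph intrinsics are the AVX512-FP16 f32 -> f16 conversions (VCVTPS2PHX).
    // Unlike the older F16C _mm_cvtps_ph they return a __m128h directly and round according to
    // MXCSR (or the embedded rounding of the *_round_* forms) instead of an imm8 rounding mode.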
24896    #[simd_test(enable = "avx512fp16,avx512vl")]
24897    unsafe fn test_mm_cvtxps_ph() {
24898        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24899        let r = _mm_cvtxps_ph(a);
24900        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24901        assert_eq_m128h(r, e);
24902    }
24903
24904    #[simd_test(enable = "avx512fp16,avx512vl")]
24905    unsafe fn test_mm_mask_cvtxps_ph() {
24906        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24907        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24908        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
24909        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
24910        assert_eq_m128h(r, e);
24911    }
24912
24913    #[simd_test(enable = "avx512fp16,avx512vl")]
24914    unsafe fn test_mm_maskz_cvtxps_ph() {
24915        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24916        let r = _mm_maskz_cvtxps_ph(0b0101, a);
24917        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24918        assert_eq_m128h(r, e);
24919    }
24920
24921    #[simd_test(enable = "avx512fp16,avx512vl")]
24922    unsafe fn test_mm256_cvtxps_ph() {
24923        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24924        let r = _mm256_cvtxps_ph(a);
24925        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24926        assert_eq_m128h(r, e);
24927    }
24928
24929    #[simd_test(enable = "avx512fp16,avx512vl")]
24930    unsafe fn test_mm256_mask_cvtxps_ph() {
24931        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24932        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24933        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
24934        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24935        assert_eq_m128h(r, e);
24936    }
24937
24938    #[simd_test(enable = "avx512fp16,avx512vl")]
24939    unsafe fn test_mm256_maskz_cvtxps_ph() {
24940        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24941        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
24942        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24943        assert_eq_m128h(r, e);
24944    }
24945
24946    #[simd_test(enable = "avx512fp16")]
24947    unsafe fn test_mm512_cvtxps_ph() {
24948        let a = _mm512_set_ps(
24949            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24950        );
24951        let r = _mm512_cvtxps_ph(a);
24952        let e = _mm256_set_ph(
24953            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24954        );
24955        assert_eq_m256h(r, e);
24956    }
24957
24958    #[simd_test(enable = "avx512fp16")]
24959    unsafe fn test_mm512_mask_cvtxps_ph() {
24960        let a = _mm512_set_ps(
24961            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24962        );
24963        let src = _mm256_set_ph(
24964            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24965        );
24966        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
24967        let e = _mm256_set_ph(
24968            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24969        );
24970        assert_eq_m256h(r, e);
24971    }
24972
24973    #[simd_test(enable = "avx512fp16")]
24974    unsafe fn test_mm512_maskz_cvtxps_ph() {
24975        let a = _mm512_set_ps(
24976            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24977        );
24978        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
24979        let e = _mm256_set_ph(
24980            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24981        );
24982        assert_eq_m256h(r, e);
24983    }
24984
24985    #[simd_test(enable = "avx512fp16")]
24986    unsafe fn test_mm512_cvtx_roundps_ph() {
24987        let a = _mm512_set_ps(
24988            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24989        );
24990        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24991        let e = _mm256_set_ph(
24992            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24993        );
24994        assert_eq_m256h(r, e);
24995    }
24996
24997    #[simd_test(enable = "avx512fp16")]
24998    unsafe fn test_mm512_mask_cvtx_roundps_ph() {
24999        let a = _mm512_set_ps(
25000            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25001        );
25002        let src = _mm256_set_ph(
25003            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
25004        );
25005        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25006            src,
25007            0b0101010101010101,
25008            a,
25009        );
25010        let e = _mm256_set_ph(
25011            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
25012            16.0,
25013        );
25014        assert_eq_m256h(r, e);
25015    }
25016
25017    #[simd_test(enable = "avx512fp16")]
25018    unsafe fn test_mm512_maskz_cvtx_roundps_ph() {
25019        let a = _mm512_set_ps(
25020            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25021        );
25022        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25023            0b0101010101010101,
25024            a,
25025        );
25026        let e = _mm256_set_ph(
25027            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25028        );
25029        assert_eq_m256h(r, e);
25030    }
25031
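    // For the scalar f32 -> f16 conversions the result takes its upper seven elements from the
    // first __m128h operand; only element 0 comes from the converted float.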
25032    #[simd_test(enable = "avx512fp16")]
25033    unsafe fn test_mm_cvtss_sh() {
25034        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25035        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25036        let r = _mm_cvtss_sh(a, b);
25037        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25038        assert_eq_m128h(r, e);
25039    }
25040
25041    #[simd_test(enable = "avx512fp16")]
25042    unsafe fn test_mm_mask_cvtss_sh() {
25043        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25044        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25045        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25046        let r = _mm_mask_cvtss_sh(src, 0, a, b);
25047        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25048        assert_eq_m128h(r, e);
25049        let r = _mm_mask_cvtss_sh(src, 1, a, b);
25050        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25051        assert_eq_m128h(r, e);
25052    }
25053
25054    #[simd_test(enable = "avx512fp16")]
25055    unsafe fn test_mm_maskz_cvtss_sh() {
25056        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25057        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25058        let r = _mm_maskz_cvtss_sh(0, a, b);
25059        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25060        assert_eq_m128h(r, e);
25061        let r = _mm_maskz_cvtss_sh(1, a, b);
25062        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25063        assert_eq_m128h(r, e);
25064    }
25065
25066    #[simd_test(enable = "avx512fp16")]
25067    unsafe fn test_mm_cvt_roundss_sh() {
25068        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25069        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25070        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25071        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25072        assert_eq_m128h(r, e);
25073    }
25074
25075    #[simd_test(enable = "avx512fp16")]
25076    unsafe fn test_mm_mask_cvt_roundss_sh() {
25077        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25078        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25079        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25080        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25081            src, 0, a, b,
25082        );
25083        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25084        assert_eq_m128h(r, e);
25085        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25086            src, 1, a, b,
25087        );
25088        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25089        assert_eq_m128h(r, e);
25090    }
25091
25092    #[simd_test(enable = "avx512fp16")]
25093    unsafe fn test_mm_maskz_cvt_roundss_sh() {
25094        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25095        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25096        let r =
25097            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25098        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25099        assert_eq_m128h(r, e);
25100        let r =
25101            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25102        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25103        assert_eq_m128h(r, e);
25104    }
25105
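    // f64 -> f16 conversions narrow by a factor of four, so even a 512-bit source only fills a
    // __m128h; for the 128-bit and 256-bit sources the unused upper elements are zeroed.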
25106    #[simd_test(enable = "avx512fp16,avx512vl")]
25107    unsafe fn test_mm_cvtpd_ph() {
25108        let a = _mm_set_pd(1.0, 2.0);
25109        let r = _mm_cvtpd_ph(a);
25110        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25111        assert_eq_m128h(r, e);
25112    }
25113
25114    #[simd_test(enable = "avx512fp16,avx512vl")]
25115    unsafe fn test_mm_mask_cvtpd_ph() {
25116        let a = _mm_set_pd(1.0, 2.0);
25117        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25118        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
25119        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25120        assert_eq_m128h(r, e);
25121    }
25122
25123    #[simd_test(enable = "avx512fp16,avx512vl")]
25124    unsafe fn test_mm_maskz_cvtpd_ph() {
25125        let a = _mm_set_pd(1.0, 2.0);
25126        let r = _mm_maskz_cvtpd_ph(0b01, a);
25127        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25128        assert_eq_m128h(r, e);
25129    }
25130
25131    #[simd_test(enable = "avx512fp16,avx512vl")]
25132    unsafe fn test_mm256_cvtpd_ph() {
25133        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25134        let r = _mm256_cvtpd_ph(a);
25135        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25136        assert_eq_m128h(r, e);
25137    }
25138
25139    #[simd_test(enable = "avx512fp16,avx512vl")]
25140    unsafe fn test_mm256_mask_cvtpd_ph() {
25141        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25142        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25143        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
25144        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25145        assert_eq_m128h(r, e);
25146    }
25147
25148    #[simd_test(enable = "avx512fp16,avx512vl")]
25149    unsafe fn test_mm256_maskz_cvtpd_ph() {
25150        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25151        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
25152        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25153        assert_eq_m128h(r, e);
25154    }
25155
25156    #[simd_test(enable = "avx512fp16")]
25157    unsafe fn test_mm512_cvtpd_ph() {
25158        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25159        let r = _mm512_cvtpd_ph(a);
25160        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25161        assert_eq_m128h(r, e);
25162    }
25163
25164    #[simd_test(enable = "avx512fp16")]
25165    unsafe fn test_mm512_mask_cvtpd_ph() {
25166        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25167        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25168        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
25169        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25170        assert_eq_m128h(r, e);
25171    }
25172
25173    #[simd_test(enable = "avx512fp16")]
25174    unsafe fn test_mm512_maskz_cvtpd_ph() {
25175        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25176        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
25177        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25178        assert_eq_m128h(r, e);
25179    }
25180
25181    #[simd_test(enable = "avx512fp16")]
25182    unsafe fn test_mm512_cvt_roundpd_ph() {
25183        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25184        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25185        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25186        assert_eq_m128h(r, e);
25187    }
25188
25189    #[simd_test(enable = "avx512fp16")]
25190    unsafe fn test_mm512_mask_cvt_roundpd_ph() {
25191        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25192        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25193        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25194            src, 0b01010101, a,
25195        );
25196        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25197        assert_eq_m128h(r, e);
25198    }
25199
25200    #[simd_test(enable = "avx512fp16")]
25201    unsafe fn test_mm512_maskz_cvt_roundpd_ph() {
25202        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25203        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25204            0b01010101, a,
25205        );
25206        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25207        assert_eq_m128h(r, e);
25208    }
25209
25210    #[simd_test(enable = "avx512fp16")]
25211    unsafe fn test_mm_cvtsd_sh() {
25212        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25213        let b = _mm_setr_pd(1.0, 2.0);
25214        let r = _mm_cvtsd_sh(a, b);
25215        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25216        assert_eq_m128h(r, e);
25217    }
25218
25219    #[simd_test(enable = "avx512fp16")]
25220    unsafe fn test_mm_mask_cvtsd_sh() {
25221        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25222        let b = _mm_setr_pd(1.0, 2.0);
25223        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25224        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
25225        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25226        assert_eq_m128h(r, e);
25227        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
25228        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25229        assert_eq_m128h(r, e);
25230    }
25231
25232    #[simd_test(enable = "avx512fp16")]
25233    unsafe fn test_mm_maskz_cvtsd_sh() {
25234        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25235        let b = _mm_setr_pd(1.0, 2.0);
25236        let r = _mm_maskz_cvtsd_sh(0, a, b);
25237        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25238        assert_eq_m128h(r, e);
25239        let r = _mm_maskz_cvtsd_sh(1, a, b);
25240        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25241        assert_eq_m128h(r, e);
25242    }
25243
25244    #[simd_test(enable = "avx512fp16")]
25245    unsafe fn test_mm_cvt_roundsd_sh() {
25246        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25247        let b = _mm_setr_pd(1.0, 2.0);
25248        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25249        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25250        assert_eq_m128h(r, e);
25251    }
25252
25253    #[simd_test(enable = "avx512fp16")]
25254    unsafe fn test_mm_mask_cvt_roundsd_sh() {
25255        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25256        let b = _mm_setr_pd(1.0, 2.0);
25257        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25258        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25259            src, 0, a, b,
25260        );
25261        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25262        assert_eq_m128h(r, e);
25263        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25264            src, 1, a, b,
25265        );
25266        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25267        assert_eq_m128h(r, e);
25268    }
25269
25270    #[simd_test(enable = "avx512fp16")]
25271    unsafe fn test_mm_maskz_cvt_roundsd_sh() {
25272        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25273        let b = _mm_setr_pd(1.0, 2.0);
25274        let r =
25275            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25276        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25277        assert_eq_m128h(r, e);
25278        let r =
25279            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25280        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25281        assert_eq_m128h(r, e);
25282    }
25283
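    // _mm*_cvtph_epi16/_cvtph_epu16 convert using the current (or embedded) rounding mode, while
    // the _mm*_cvttph_* variants tested further below always truncate toward zero. The inputs
    // here are whole numbers, so both families are expected to produce identical results.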
25284    #[simd_test(enable = "avx512fp16,avx512vl")]
25285    unsafe fn test_mm_cvtph_epi16() {
25286        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25287        let r = _mm_cvtph_epi16(a);
25288        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25289        assert_eq_m128i(r, e);
25290    }
25291
25292    #[simd_test(enable = "avx512fp16,avx512vl")]
25293    unsafe fn test_mm_mask_cvtph_epi16() {
25294        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25295        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25296        let r = _mm_mask_cvtph_epi16(src, 0b01010101, a);
25297        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25298        assert_eq_m128i(r, e);
25299    }
25300
25301    #[simd_test(enable = "avx512fp16,avx512vl")]
25302    unsafe fn test_mm_maskz_cvtph_epi16() {
25303        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25304        let r = _mm_maskz_cvtph_epi16(0b01010101, a);
25305        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25306        assert_eq_m128i(r, e);
25307    }
25308
25309    #[simd_test(enable = "avx512fp16,avx512vl")]
25310    unsafe fn test_mm256_cvtph_epi16() {
25311        let a = _mm256_set_ph(
25312            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25313        );
25314        let r = _mm256_cvtph_epi16(a);
25315        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25316        assert_eq_m256i(r, e);
25317    }
25318
25319    #[simd_test(enable = "avx512fp16,avx512vl")]
25320    unsafe fn test_mm256_mask_cvtph_epi16() {
25321        let a = _mm256_set_ph(
25322            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25323        );
25324        let src = _mm256_set_epi16(
25325            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25326        );
25327        let r = _mm256_mask_cvtph_epi16(src, 0b0101010101010101, a);
25328        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25329        assert_eq_m256i(r, e);
25330    }
25331
25332    #[simd_test(enable = "avx512fp16,avx512vl")]
25333    unsafe fn test_mm256_maskz_cvtph_epi16() {
25334        let a = _mm256_set_ph(
25335            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25336        );
25337        let r = _mm256_maskz_cvtph_epi16(0b0101010101010101, a);
25338        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25339        assert_eq_m256i(r, e);
25340    }
25341
25342    #[simd_test(enable = "avx512fp16")]
25343    unsafe fn test_mm512_cvtph_epi16() {
25344        let a = _mm512_set_ph(
25345            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25346            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25347            31.0, 32.0,
25348        );
25349        let r = _mm512_cvtph_epi16(a);
25350        let e = _mm512_set_epi16(
25351            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25352            25, 26, 27, 28, 29, 30, 31, 32,
25353        );
25354        assert_eq_m512i(r, e);
25355    }
25356
25357    #[simd_test(enable = "avx512fp16")]
25358    unsafe fn test_mm512_mask_cvtph_epi16() {
25359        let a = _mm512_set_ph(
25360            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25361            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25362            31.0, 32.0,
25363        );
25364        let src = _mm512_set_epi16(
25365            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25366            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25367        );
25368        let r = _mm512_mask_cvtph_epi16(src, 0b01010101010101010101010101010101, a);
25369        let e = _mm512_set_epi16(
25370            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25371            24, 34, 26, 36, 28, 38, 30, 40, 32,
25372        );
25373        assert_eq_m512i(r, e);
25374    }
25375
25376    #[simd_test(enable = "avx512fp16")]
25377    unsafe fn test_mm512_maskz_cvtph_epi16() {
25378        let a = _mm512_set_ph(
25379            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25380            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25381            31.0, 32.0,
25382        );
25383        let r = _mm512_maskz_cvtph_epi16(0b01010101010101010101010101010101, a);
25384        let e = _mm512_set_epi16(
25385            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25386            0, 28, 0, 30, 0, 32,
25387        );
25388        assert_eq_m512i(r, e);
25389    }
25390
25391    #[simd_test(enable = "avx512fp16")]
25392    unsafe fn test_mm512_cvt_roundph_epi16() {
25393        let a = _mm512_set_ph(
25394            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25395            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25396            31.0, 32.0,
25397        );
25398        let r = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25399        let e = _mm512_set_epi16(
25400            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25401            25, 26, 27, 28, 29, 30, 31, 32,
25402        );
25403        assert_eq_m512i(r, e);
25404    }
25405
25406    #[simd_test(enable = "avx512fp16")]
25407    unsafe fn test_mm512_mask_cvt_roundph_epi16() {
25408        let a = _mm512_set_ph(
25409            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25410            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25411            31.0, 32.0,
25412        );
25413        let src = _mm512_set_epi16(
25414            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25415            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25416        );
25417        let r = _mm512_mask_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25418            src,
25419            0b01010101010101010101010101010101,
25420            a,
25421        );
25422        let e = _mm512_set_epi16(
25423            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25424            24, 34, 26, 36, 28, 38, 30, 40, 32,
25425        );
25426        assert_eq_m512i(r, e);
25427    }
25428
25429    #[simd_test(enable = "avx512fp16")]
25430    unsafe fn test_mm512_maskz_cvt_roundph_epi16() {
25431        let a = _mm512_set_ph(
25432            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25433            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25434            31.0, 32.0,
25435        );
25436        let r = _mm512_maskz_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25437            0b01010101010101010101010101010101,
25438            a,
25439        );
25440        let e = _mm512_set_epi16(
25441            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25442            0, 28, 0, 30, 0, 32,
25443        );
25444        assert_eq_m512i(r, e);
25445    }
25446
25447    #[simd_test(enable = "avx512fp16,avx512vl")]
25448    unsafe fn test_mm_cvtph_epu16() {
25449        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25450        let r = _mm_cvtph_epu16(a);
25451        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25452        assert_eq_m128i(r, e);
25453    }
25454
25455    #[simd_test(enable = "avx512fp16,avx512vl")]
25456    unsafe fn test_mm_mask_cvtph_epu16() {
25457        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25458        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25459        let r = _mm_mask_cvtph_epu16(src, 0b01010101, a);
25460        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25461        assert_eq_m128i(r, e);
25462    }
25463
25464    #[simd_test(enable = "avx512fp16,avx512vl")]
25465    unsafe fn test_mm_maskz_cvtph_epu16() {
25466        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25467        let r = _mm_maskz_cvtph_epu16(0b01010101, a);
25468        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25469        assert_eq_m128i(r, e);
25470    }
25471
25472    #[simd_test(enable = "avx512fp16,avx512vl")]
25473    unsafe fn test_mm256_cvtph_epu16() {
25474        let a = _mm256_set_ph(
25475            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25476        );
25477        let r = _mm256_cvtph_epu16(a);
25478        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25479        assert_eq_m256i(r, e);
25480    }
25481
25482    #[simd_test(enable = "avx512fp16,avx512vl")]
25483    unsafe fn test_mm256_mask_cvtph_epu16() {
25484        let a = _mm256_set_ph(
25485            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25486        );
25487        let src = _mm256_set_epi16(
25488            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25489        );
25490        let r = _mm256_mask_cvtph_epu16(src, 0b0101010101010101, a);
25491        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25492        assert_eq_m256i(r, e);
25493    }
25494
25495    #[simd_test(enable = "avx512fp16,avx512vl")]
25496    unsafe fn test_mm256_maskz_cvtph_epu16() {
25497        let a = _mm256_set_ph(
25498            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25499        );
25500        let r = _mm256_maskz_cvtph_epu16(0b0101010101010101, a);
25501        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25502        assert_eq_m256i(r, e);
25503    }
25504
25505    #[simd_test(enable = "avx512fp16")]
25506    unsafe fn test_mm512_cvtph_epu16() {
25507        let a = _mm512_set_ph(
25508            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25509            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25510            31.0, 32.0,
25511        );
25512        let r = _mm512_cvtph_epu16(a);
25513        let e = _mm512_set_epi16(
25514            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25515            25, 26, 27, 28, 29, 30, 31, 32,
25516        );
25517        assert_eq_m512i(r, e);
25518    }
25519
25520    #[simd_test(enable = "avx512fp16")]
25521    unsafe fn test_mm512_mask_cvtph_epu16() {
25522        let a = _mm512_set_ph(
25523            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25524            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25525            31.0, 32.0,
25526        );
25527        let src = _mm512_set_epi16(
25528            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25529            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25530        );
25531        let r = _mm512_mask_cvtph_epu16(src, 0b01010101010101010101010101010101, a);
25532        let e = _mm512_set_epi16(
25533            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25534            24, 34, 26, 36, 28, 38, 30, 40, 32,
25535        );
25536        assert_eq_m512i(r, e);
25537    }
25538
25539    #[simd_test(enable = "avx512fp16")]
25540    unsafe fn test_mm512_maskz_cvtph_epu16() {
25541        let a = _mm512_set_ph(
25542            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25543            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25544            31.0, 32.0,
25545        );
25546        let r = _mm512_maskz_cvtph_epu16(0b01010101010101010101010101010101, a);
25547        let e = _mm512_set_epi16(
25548            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25549            0, 28, 0, 30, 0, 32,
25550        );
25551        assert_eq_m512i(r, e);
25552    }
25553
25554    #[simd_test(enable = "avx512fp16")]
25555    unsafe fn test_mm512_cvt_roundph_epu16() {
25556        let a = _mm512_set_ph(
25557            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25558            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25559            31.0, 32.0,
25560        );
25561        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25562        let e = _mm512_set_epi16(
25563            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25564            25, 26, 27, 28, 29, 30, 31, 32,
25565        );
25566        assert_eq_m512i(r, e);
25567    }
25568
25569    #[simd_test(enable = "avx512fp16")]
25570    unsafe fn test_mm512_mask_cvt_roundph_epu16() {
25571        let a = _mm512_set_ph(
25572            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25573            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25574            31.0, 32.0,
25575        );
25576        let src = _mm512_set_epi16(
25577            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25578            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25579        );
25580        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25581            src,
25582            0b01010101010101010101010101010101,
25583            a,
25584        );
25585        let e = _mm512_set_epi16(
25586            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25587            24, 34, 26, 36, 28, 38, 30, 40, 32,
25588        );
25589        assert_eq_m512i(r, e);
25590    }
25591
25592    #[simd_test(enable = "avx512fp16")]
25593    unsafe fn test_mm512_maskz_cvt_roundph_epu16() {
25594        let a = _mm512_set_ph(
25595            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25596            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25597            31.0, 32.0,
25598        );
25599        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25600            0b01010101010101010101010101010101,
25601            a,
25602        );
25603        let e = _mm512_set_epi16(
25604            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25605            0, 28, 0, 30, 0, 32,
25606        );
25607        assert_eq_m512i(r, e);
25608    }
25609
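    // Truncating conversions: the _mm*_cvttph_* intrinsics round toward zero regardless of MXCSR.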
25610    #[simd_test(enable = "avx512fp16,avx512vl")]
25611    unsafe fn test_mm_cvttph_epi16() {
25612        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25613        let r = _mm_cvttph_epi16(a);
25614        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25615        assert_eq_m128i(r, e);
25616    }
25617
25618    #[simd_test(enable = "avx512fp16,avx512vl")]
25619    unsafe fn test_mm_mask_cvttph_epi16() {
25620        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25621        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25622        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25623        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25624        assert_eq_m128i(r, e);
25625    }
25626
25627    #[simd_test(enable = "avx512fp16,avx512vl")]
25628    unsafe fn test_mm_maskz_cvttph_epi16() {
25629        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25630        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25631        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25632        assert_eq_m128i(r, e);
25633    }
25634
25635    #[simd_test(enable = "avx512fp16,avx512vl")]
25636    unsafe fn test_mm256_cvttph_epi16() {
25637        let a = _mm256_set_ph(
25638            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25639        );
25640        let r = _mm256_cvttph_epi16(a);
25641        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25642        assert_eq_m256i(r, e);
25643    }
25644
25645    #[simd_test(enable = "avx512fp16,avx512vl")]
25646    unsafe fn test_mm256_mask_cvttph_epi16() {
25647        let a = _mm256_set_ph(
25648            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25649        );
25650        let src = _mm256_set_epi16(
25651            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25652        );
25653        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25654        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25655        assert_eq_m256i(r, e);
25656    }
25657
25658    #[simd_test(enable = "avx512fp16,avx512vl")]
25659    unsafe fn test_mm256_maskz_cvttph_epi16() {
25660        let a = _mm256_set_ph(
25661            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25662        );
25663        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25664        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25665        assert_eq_m256i(r, e);
25666    }
25667
25668    #[simd_test(enable = "avx512fp16")]
25669    unsafe fn test_mm512_cvttph_epi16() {
25670        let a = _mm512_set_ph(
25671            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25672            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25673            31.0, 32.0,
25674        );
25675        let r = _mm512_cvttph_epi16(a);
25676        let e = _mm512_set_epi16(
25677            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25678            25, 26, 27, 28, 29, 30, 31, 32,
25679        );
25680        assert_eq_m512i(r, e);
25681    }
25682
25683    #[simd_test(enable = "avx512fp16")]
25684    unsafe fn test_mm512_mask_cvttph_epi16() {
25685        let a = _mm512_set_ph(
25686            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25687            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25688            31.0, 32.0,
25689        );
25690        let src = _mm512_set_epi16(
25691            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25692            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25693        );
25694        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
25695        let e = _mm512_set_epi16(
25696            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25697            24, 34, 26, 36, 28, 38, 30, 40, 32,
25698        );
25699        assert_eq_m512i(r, e);
25700    }
25701
25702    #[simd_test(enable = "avx512fp16")]
25703    unsafe fn test_mm512_maskz_cvttph_epi16() {
25704        let a = _mm512_set_ph(
25705            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25706            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25707            31.0, 32.0,
25708        );
25709        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
25710        let e = _mm512_set_epi16(
25711            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25712            0, 28, 0, 30, 0, 32,
25713        );
25714        assert_eq_m512i(r, e);
25715    }
25716
25717    #[simd_test(enable = "avx512fp16")]
25718    unsafe fn test_mm512_cvtt_roundph_epi16() {
25719        let a = _mm512_set_ph(
25720            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25721            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25722            31.0, 32.0,
25723        );
25724        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25725        let e = _mm512_set_epi16(
25726            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25727            25, 26, 27, 28, 29, 30, 31, 32,
25728        );
25729        assert_eq_m512i(r, e);
25730    }
25731
25732    #[simd_test(enable = "avx512fp16")]
25733    unsafe fn test_mm512_mask_cvtt_roundph_epi16() {
25734        let a = _mm512_set_ph(
25735            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25736            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25737            31.0, 32.0,
25738        );
25739        let src = _mm512_set_epi16(
25740            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25741            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25742        );
25743        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25744            src,
25745            0b01010101010101010101010101010101,
25746            a,
25747        );
25748        let e = _mm512_set_epi16(
25749            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25750            24, 34, 26, 36, 28, 38, 30, 40, 32,
25751        );
25752        assert_eq_m512i(r, e);
25753    }
25754
25755    #[simd_test(enable = "avx512fp16")]
25756    unsafe fn test_mm512_maskz_cvtt_roundph_epi16() {
25757        let a = _mm512_set_ph(
25758            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25759            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25760            31.0, 32.0,
25761        );
25762        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25763            0b01010101010101010101010101010101,
25764            a,
25765        );
25766        let e = _mm512_set_epi16(
25767            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25768            0, 28, 0, 30, 0, 32,
25769        );
25770        assert_eq_m512i(r, e);
25771    }
25772
25773    #[simd_test(enable = "avx512fp16,avx512vl")]
25774    unsafe fn test_mm_cvttph_epu16() {
25775        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25776        let r = _mm_cvttph_epu16(a);
25777        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25778        assert_eq_m128i(r, e);
25779    }
25780
25781    #[simd_test(enable = "avx512fp16,avx512vl")]
25782    unsafe fn test_mm_mask_cvttph_epu16() {
25783        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25784        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25785        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
25786        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25787        assert_eq_m128i(r, e);
25788    }
25789
25790    #[simd_test(enable = "avx512fp16,avx512vl")]
25791    unsafe fn test_mm_maskz_cvttph_epu16() {
25792        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25793        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
25794        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25795        assert_eq_m128i(r, e);
25796    }
25797
25798    #[simd_test(enable = "avx512fp16,avx512vl")]
25799    unsafe fn test_mm256_cvttph_epu16() {
25800        let a = _mm256_set_ph(
25801            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25802        );
25803        let r = _mm256_cvttph_epu16(a);
25804        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25805        assert_eq_m256i(r, e);
25806    }
25807
25808    #[simd_test(enable = "avx512fp16,avx512vl")]
25809    unsafe fn test_mm256_mask_cvttph_epu16() {
25810        let a = _mm256_set_ph(
25811            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25812        );
25813        let src = _mm256_set_epi16(
25814            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25815        );
25816        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
25817        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25818        assert_eq_m256i(r, e);
25819    }
25820
25821    #[simd_test(enable = "avx512fp16,avx512vl")]
25822    unsafe fn test_mm256_maskz_cvttph_epu16() {
25823        let a = _mm256_set_ph(
25824            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25825        );
25826        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
25827        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25828        assert_eq_m256i(r, e);
25829    }
25830
25831    #[simd_test(enable = "avx512fp16")]
25832    unsafe fn test_mm512_cvttph_epu16() {
25833        let a = _mm512_set_ph(
25834            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25835            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25836            31.0, 32.0,
25837        );
25838        let r = _mm512_cvttph_epu16(a);
25839        let e = _mm512_set_epi16(
25840            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25841            25, 26, 27, 28, 29, 30, 31, 32,
25842        );
25843        assert_eq_m512i(r, e);
25844    }
25845
25846    #[simd_test(enable = "avx512fp16")]
25847    unsafe fn test_mm512_mask_cvttph_epu16() {
25848        let a = _mm512_set_ph(
25849            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25850            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25851            31.0, 32.0,
25852        );
25853        let src = _mm512_set_epi16(
25854            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25855            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25856        );
25857        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
25858        let e = _mm512_set_epi16(
25859            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25860            24, 34, 26, 36, 28, 38, 30, 40, 32,
25861        );
25862        assert_eq_m512i(r, e);
25863    }
25864
25865    #[simd_test(enable = "avx512fp16")]
25866    unsafe fn test_mm512_maskz_cvttph_epu16() {
25867        let a = _mm512_set_ph(
25868            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25869            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25870            31.0, 32.0,
25871        );
25872        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
25873        let e = _mm512_set_epi16(
25874            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25875            0, 28, 0, 30, 0, 32,
25876        );
25877        assert_eq_m512i(r, e);
25878    }
25879
25880    #[simd_test(enable = "avx512fp16")]
25881    unsafe fn test_mm512_cvtt_roundph_epu16() {
25882        let a = _mm512_set_ph(
25883            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25884            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25885            31.0, 32.0,
25886        );
25887        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
25888        let e = _mm512_set_epi16(
25889            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25890            25, 26, 27, 28, 29, 30, 31, 32,
25891        );
25892        assert_eq_m512i(r, e);
25893    }
25894
25895    #[simd_test(enable = "avx512fp16")]
25896    unsafe fn test_mm512_mask_cvtt_roundph_epu16() {
25897        let a = _mm512_set_ph(
25898            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25899            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25900            31.0, 32.0,
25901        );
25902        let src = _mm512_set_epi16(
25903            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25904            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25905        );
25906        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25907            src,
25908            0b01010101010101010101010101010101,
25909            a,
25910        );
25911        let e = _mm512_set_epi16(
25912            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25913            24, 34, 26, 36, 28, 38, 30, 40, 32,
25914        );
25915        assert_eq_m512i(r, e);
25916    }
25917
25918    #[simd_test(enable = "avx512fp16")]
25919    unsafe fn test_mm512_maskz_cvtt_roundph_epu16() {
25920        let a = _mm512_set_ph(
25921            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25922            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25923            31.0, 32.0,
25924        );
25925        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25926            0b01010101010101010101010101010101,
25927            a,
25928        );
25929        let e = _mm512_set_epi16(
25930            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25931            0, 28, 0, 30, 0, 32,
25932        );
25933        assert_eq_m512i(r, e);
25934    }
25935
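    // Packed f16 -> i32 conversions. `_mm*_set_ph` and `_mm*_set_epi32` both list
    // elements from the highest lane down, so the expected vectors mirror the float
    // inputs lane for lane; in the mask/maskz variants bit i of the mask keeps the
    // converted lane i, while the remaining lanes come from `src` (mask) or are
    // zeroed (maskz).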
25936    #[simd_test(enable = "avx512fp16,avx512vl")]
25937    unsafe fn test_mm_cvtph_epi32() {
25938        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25939        let r = _mm_cvtph_epi32(a);
25940        let e = _mm_set_epi32(1, 2, 3, 4);
25941        assert_eq_m128i(r, e);
25942    }
25943
25944    #[simd_test(enable = "avx512fp16,avx512vl")]
25945    unsafe fn test_mm_mask_cvtph_epi32() {
25946        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25947        let src = _mm_set_epi32(10, 11, 12, 13);
25948        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
25949        let e = _mm_set_epi32(10, 2, 12, 4);
25950        assert_eq_m128i(r, e);
25951    }
25952
25953    #[simd_test(enable = "avx512fp16,avx512vl")]
25954    unsafe fn test_mm_maskz_cvtph_epi32() {
25955        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25956        let r = _mm_maskz_cvtph_epi32(0b0101, a);
25957        let e = _mm_set_epi32(0, 2, 0, 4);
25958        assert_eq_m128i(r, e);
25959    }
25960
25961    #[simd_test(enable = "avx512fp16,avx512vl")]
25962    unsafe fn test_mm256_cvtph_epi32() {
25963        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25964        let r = _mm256_cvtph_epi32(a);
25965        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
25966        assert_eq_m256i(r, e);
25967    }
25968
25969    #[simd_test(enable = "avx512fp16,avx512vl")]
25970    unsafe fn test_mm256_mask_cvtph_epi32() {
25971        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25972        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
25973        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
25974        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
25975        assert_eq_m256i(r, e);
25976    }
25977
25978    #[simd_test(enable = "avx512fp16,avx512vl")]
25979    unsafe fn test_mm256_maskz_cvtph_epi32() {
25980        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25981        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
25982        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
25983        assert_eq_m256i(r, e);
25984    }
25985
25986    #[simd_test(enable = "avx512fp16")]
25987    unsafe fn test_mm512_cvtph_epi32() {
25988        let a = _mm256_set_ph(
25989            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25990        );
25991        let r = _mm512_cvtph_epi32(a);
25992        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25993        assert_eq_m512i(r, e);
25994    }
25995
25996    #[simd_test(enable = "avx512fp16")]
25997    unsafe fn test_mm512_mask_cvtph_epi32() {
25998        let a = _mm256_set_ph(
25999            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26000        );
26001        let src = _mm512_set_epi32(
26002            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26003        );
26004        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
26005        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26006        assert_eq_m512i(r, e);
26007    }
26008
26009    #[simd_test(enable = "avx512fp16")]
26010    unsafe fn test_mm512_maskz_cvtph_epi32() {
26011        let a = _mm256_set_ph(
26012            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26013        );
26014        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
26015        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26016        assert_eq_m512i(r, e);
26017    }
26018
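    // The `_cvt_round*` variants take the rounding control as a const generic;
    // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` selects round-to-nearest-even
    // with exceptions suppressed, which for these integral inputs produces the same
    // results as the plain conversions above.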
26019    #[simd_test(enable = "avx512fp16")]
26020    unsafe fn test_mm512_cvt_roundph_epi32() {
26021        let a = _mm256_set_ph(
26022            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26023        );
26024        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26025        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26026        assert_eq_m512i(r, e);
26027    }
26028
26029    #[simd_test(enable = "avx512fp16")]
26030    unsafe fn test_mm512_mask_cvt_roundph_epi32() {
26031        let a = _mm256_set_ph(
26032            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26033        );
26034        let src = _mm512_set_epi32(
26035            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26036        );
26037        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26038            src,
26039            0b0101010101010101,
26040            a,
26041        );
26042        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26043        assert_eq_m512i(r, e);
26044    }
26045
26046    #[simd_test(enable = "avx512fp16")]
26047    unsafe fn test_mm512_maskz_cvt_roundph_epi32() {
26048        let a = _mm256_set_ph(
26049            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26050        );
26051        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26052            0b0101010101010101,
26053            a,
26054        );
26055        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26056        assert_eq_m512i(r, e);
26057    }
26058
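    // The scalar conversions read only the lowest f16 lane of the vector argument,
    // hence `_mm_setr_ph(1.0, ...)` and the expected result of 1.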
26059    #[simd_test(enable = "avx512fp16")]
26060    unsafe fn test_mm_cvtsh_i32() {
26061        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26062        let r = _mm_cvtsh_i32(a);
26063        assert_eq!(r, 1);
26064    }
26065
26066    #[simd_test(enable = "avx512fp16")]
26067    unsafe fn test_mm_cvt_roundsh_i32() {
26068        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26069        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26070        assert_eq!(r, 1);
26071    }
26072
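    // Unsigned f16 -> u32 conversions follow the same pattern; with non-negative,
    // integral inputs the expected values are identical to the signed tests.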
26073    #[simd_test(enable = "avx512fp16,avx512vl")]
26074    unsafe fn test_mm_cvtph_epu32() {
26075        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26076        let r = _mm_cvtph_epu32(a);
26077        let e = _mm_set_epi32(1, 2, 3, 4);
26078        assert_eq_m128i(r, e);
26079    }
26080
26081    #[simd_test(enable = "avx512fp16,avx512vl")]
26082    unsafe fn test_mm_mask_cvtph_epu32() {
26083        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26084        let src = _mm_set_epi32(10, 11, 12, 13);
26085        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
26086        let e = _mm_set_epi32(10, 2, 12, 4);
26087        assert_eq_m128i(r, e);
26088    }
26089
26090    #[simd_test(enable = "avx512fp16,avx512vl")]
26091    unsafe fn test_mm_maskz_cvtph_epu32() {
26092        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26093        let r = _mm_maskz_cvtph_epu32(0b0101, a);
26094        let e = _mm_set_epi32(0, 2, 0, 4);
26095        assert_eq_m128i(r, e);
26096    }
26097
26098    #[simd_test(enable = "avx512fp16,avx512vl")]
26099    unsafe fn test_mm256_cvtph_epu32() {
26100        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26101        let r = _mm256_cvtph_epu32(a);
26102        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26103        assert_eq_m256i(r, e);
26104    }
26105
26106    #[simd_test(enable = "avx512fp16,avx512vl")]
26107    unsafe fn test_mm256_mask_cvtph_epu32() {
26108        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26109        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26110        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
26111        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26112        assert_eq_m256i(r, e);
26113    }
26114
26115    #[simd_test(enable = "avx512fp16,avx512vl")]
26116    unsafe fn test_mm256_maskz_cvtph_epu32() {
26117        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26118        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
26119        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26120        assert_eq_m256i(r, e);
26121    }
26122
26123    #[simd_test(enable = "avx512fp16")]
26124    unsafe fn test_mm512_cvtph_epu32() {
26125        let a = _mm256_set_ph(
26126            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26127        );
26128        let r = _mm512_cvtph_epu32(a);
26129        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26130        assert_eq_m512i(r, e);
26131    }
26132
26133    #[simd_test(enable = "avx512fp16")]
26134    unsafe fn test_mm512_mask_cvtph_epu32() {
26135        let a = _mm256_set_ph(
26136            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26137        );
26138        let src = _mm512_set_epi32(
26139            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26140        );
26141        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
26142        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26143        assert_eq_m512i(r, e);
26144    }
26145
26146    #[simd_test(enable = "avx512fp16")]
26147    unsafe fn test_mm512_maskz_cvtph_epu32() {
26148        let a = _mm256_set_ph(
26149            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26150        );
26151        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
26152        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26153        assert_eq_m512i(r, e);
26154    }
26155
26156    #[simd_test(enable = "avx512fp16")]
26157    unsafe fn test_mm512_cvt_roundph_epu32() {
26158        let a = _mm256_set_ph(
26159            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26160        );
26161        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26162        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26163        assert_eq_m512i(r, e);
26164    }
26165
26166    #[simd_test(enable = "avx512fp16")]
26167    unsafe fn test_mm512_mask_cvt_roundph_epu32() {
26168        let a = _mm256_set_ph(
26169            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26170        );
26171        let src = _mm512_set_epi32(
26172            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26173        );
26174        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26175            src,
26176            0b0101010101010101,
26177            a,
26178        );
26179        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26180        assert_eq_m512i(r, e);
26181    }
26182
26183    #[simd_test(enable = "avx512fp16")]
26184    unsafe fn test_mm512_maskz_cvt_roundph_epu32() {
26185        let a = _mm256_set_ph(
26186            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26187        );
26188        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26189            0b0101010101010101,
26190            a,
26191        );
26192        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26193        assert_eq_m512i(r, e);
26194    }
26195
26196    #[simd_test(enable = "avx512fp16")]
26197    unsafe fn test_mm_cvtsh_u32() {
26198        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26199        let r = _mm_cvtsh_u32(a);
26200        assert_eq!(r, 1);
26201    }
26202
26203    #[simd_test(enable = "avx512fp16")]
26204    unsafe fn test_mm_cvt_roundsh_u32() {
26205        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26206        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26207        assert_eq!(r, 1);
26208    }
26209
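    // The `_cvtt*` (truncating) conversions round toward zero regardless of the
    // current rounding mode, so their rounding-control variants only accept
    // `_MM_FROUND_NO_EXC` (SAE). With inputs that are already integral the results
    // match the non-truncating conversions above.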
26210    #[simd_test(enable = "avx512fp16,avx512vl")]
26211    unsafe fn test_mm_cvttph_epi32() {
26212        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26213        let r = _mm_cvttph_epi32(a);
26214        let e = _mm_set_epi32(1, 2, 3, 4);
26215        assert_eq_m128i(r, e);
26216    }
26217
26218    #[simd_test(enable = "avx512fp16,avx512vl")]
26219    unsafe fn test_mm_mask_cvttph_epi32() {
26220        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26221        let src = _mm_set_epi32(10, 11, 12, 13);
26222        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
26223        let e = _mm_set_epi32(10, 2, 12, 4);
26224        assert_eq_m128i(r, e);
26225    }
26226
26227    #[simd_test(enable = "avx512fp16,avx512vl")]
26228    unsafe fn test_mm_maskz_cvttph_epi32() {
26229        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26230        let r = _mm_maskz_cvttph_epi32(0b0101, a);
26231        let e = _mm_set_epi32(0, 2, 0, 4);
26232        assert_eq_m128i(r, e);
26233    }
26234
26235    #[simd_test(enable = "avx512fp16,avx512vl")]
26236    unsafe fn test_mm256_cvttph_epi32() {
26237        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26238        let r = _mm256_cvttph_epi32(a);
26239        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26240        assert_eq_m256i(r, e);
26241    }
26242
26243    #[simd_test(enable = "avx512fp16,avx512vl")]
26244    unsafe fn test_mm256_mask_cvttph_epi32() {
26245        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26246        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26247        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
26248        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26249        assert_eq_m256i(r, e);
26250    }
26251
26252    #[simd_test(enable = "avx512fp16,avx512vl")]
26253    unsafe fn test_mm256_maskz_cvttph_epi32() {
26254        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26255        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
26256        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26257        assert_eq_m256i(r, e);
26258    }
26259
26260    #[simd_test(enable = "avx512fp16")]
26261    unsafe fn test_mm512_cvttph_epi32() {
26262        let a = _mm256_set_ph(
26263            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26264        );
26265        let r = _mm512_cvttph_epi32(a);
26266        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26267        assert_eq_m512i(r, e);
26268    }
26269
26270    #[simd_test(enable = "avx512fp16")]
26271    unsafe fn test_mm512_mask_cvttph_epi32() {
26272        let a = _mm256_set_ph(
26273            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26274        );
26275        let src = _mm512_set_epi32(
26276            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26277        );
26278        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
26279        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26280        assert_eq_m512i(r, e);
26281    }
26282
26283    #[simd_test(enable = "avx512fp16")]
26284    unsafe fn test_mm512_maskz_cvttph_epi32() {
26285        let a = _mm256_set_ph(
26286            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26287        );
26288        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
26289        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26290        assert_eq_m512i(r, e);
26291    }
26292
26293    #[simd_test(enable = "avx512fp16")]
26294    unsafe fn test_mm512_cvtt_roundph_epi32() {
26295        let a = _mm256_set_ph(
26296            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26297        );
26298        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
26299        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26300        assert_eq_m512i(r, e);
26301    }
26302
26303    #[simd_test(enable = "avx512fp16")]
26304    unsafe fn test_mm512_mask_cvtt_roundph_epi32() {
26305        let a = _mm256_set_ph(
26306            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26307        );
26308        let src = _mm512_set_epi32(
26309            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26310        );
26311        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26312        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26313        assert_eq_m512i(r, e);
26314    }
26315
26316    #[simd_test(enable = "avx512fp16")]
26317    unsafe fn test_mm512_maskz_cvtt_roundph_epi32() {
26318        let a = _mm256_set_ph(
26319            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26320        );
26321        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26322        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26323        assert_eq_m512i(r, e);
26324    }
26325
26326    #[simd_test(enable = "avx512fp16")]
26327    unsafe fn test_mm_cvttsh_i32() {
26328        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26329        let r = _mm_cvttsh_i32(a);
26330        assert_eq!(r, 1);
26331    }
26332
26333    #[simd_test(enable = "avx512fp16")]
26334    unsafe fn test_mm_cvtt_roundsh_i32() {
26335        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26336        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
26337        assert_eq!(r, 1);
26338    }
26339
26340    #[simd_test(enable = "avx512fp16,avx512vl")]
26341    unsafe fn test_mm_cvttph_epu32() {
26342        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26343        let r = _mm_cvttph_epu32(a);
26344        let e = _mm_set_epi32(1, 2, 3, 4);
26345        assert_eq_m128i(r, e);
26346    }
26347
26348    #[simd_test(enable = "avx512fp16,avx512vl")]
26349    unsafe fn test_mm_mask_cvttph_epu32() {
26350        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26351        let src = _mm_set_epi32(10, 11, 12, 13);
26352        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
26353        let e = _mm_set_epi32(10, 2, 12, 4);
26354        assert_eq_m128i(r, e);
26355    }
26356
26357    #[simd_test(enable = "avx512fp16,avx512vl")]
26358    unsafe fn test_mm_maskz_cvttph_epu32() {
26359        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26360        let r = _mm_maskz_cvttph_epu32(0b0101, a);
26361        let e = _mm_set_epi32(0, 2, 0, 4);
26362        assert_eq_m128i(r, e);
26363    }
26364
26365    #[simd_test(enable = "avx512fp16,avx512vl")]
26366    unsafe fn test_mm256_cvttph_epu32() {
26367        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26368        let r = _mm256_cvttph_epu32(a);
26369        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26370        assert_eq_m256i(r, e);
26371    }
26372
26373    #[simd_test(enable = "avx512fp16,avx512vl")]
26374    unsafe fn test_mm256_mask_cvttph_epu32() {
26375        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26376        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26377        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
26378        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26379        assert_eq_m256i(r, e);
26380    }
26381
26382    #[simd_test(enable = "avx512fp16,avx512vl")]
26383    unsafe fn test_mm256_maskz_cvttph_epu32() {
26384        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26385        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
26386        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26387        assert_eq_m256i(r, e);
26388    }
26389
26390    #[simd_test(enable = "avx512fp16")]
26391    unsafe fn test_mm512_cvttph_epu32() {
26392        let a = _mm256_set_ph(
26393            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26394        );
26395        let r = _mm512_cvttph_epu32(a);
26396        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26397        assert_eq_m512i(r, e);
26398    }
26399
26400    #[simd_test(enable = "avx512fp16")]
26401    unsafe fn test_mm512_mask_cvttph_epu32() {
26402        let a = _mm256_set_ph(
26403            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26404        );
26405        let src = _mm512_set_epi32(
26406            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26407        );
26408        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
26409        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26410        assert_eq_m512i(r, e);
26411    }
26412
26413    #[simd_test(enable = "avx512fp16")]
26414    unsafe fn test_mm512_maskz_cvttph_epu32() {
26415        let a = _mm256_set_ph(
26416            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26417        );
26418        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
26419        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26420        assert_eq_m512i(r, e);
26421    }
26422
26423    #[simd_test(enable = "avx512fp16")]
26424    unsafe fn test_mm512_cvtt_roundph_epu32() {
26425        let a = _mm256_set_ph(
26426            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26427        );
26428        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
26429        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26430        assert_eq_m512i(r, e);
26431    }
26432
26433    #[simd_test(enable = "avx512fp16")]
26434    unsafe fn test_mm512_mask_cvtt_roundph_epu32() {
26435        let a = _mm256_set_ph(
26436            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26437        );
26438        let src = _mm512_set_epi32(
26439            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26440        );
26441        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26442        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26443        assert_eq_m512i(r, e);
26444    }
26445
26446    #[simd_test(enable = "avx512fp16")]
26447    unsafe fn test_mm512_maskz_cvtt_roundph_epu32() {
26448        let a = _mm256_set_ph(
26449            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26450        );
26451        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26452        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26453        assert_eq_m512i(r, e);
26454    }
26455
26456    #[simd_test(enable = "avx512fp16")]
26457    unsafe fn test_mm_cvttsh_u32() {
26458        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26459        let r = _mm_cvttsh_u32(a);
26460        assert_eq!(r, 1);
26461    }
26462
26463    #[simd_test(enable = "avx512fp16")]
26464    unsafe fn test_mm_cvtt_roundsh_u32() {
26465        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26466        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
26467        assert_eq!(r, 1);
26468    }
26469
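    // f16 -> 64-bit integer conversions consume only the low 2/4/8 f16 lanes of the
    // __m128h source for the 128/256/512-bit destinations; the unused upper source
    // lanes are set to 0.0 in these tests.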
26470    #[simd_test(enable = "avx512fp16,avx512vl")]
26471    unsafe fn test_mm_cvtph_epi64() {
26472        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26473        let r = _mm_cvtph_epi64(a);
26474        let e = _mm_set_epi64x(1, 2);
26475        assert_eq_m128i(r, e);
26476    }
26477
26478    #[simd_test(enable = "avx512fp16,avx512vl")]
26479    unsafe fn test_mm_mask_cvtph_epi64() {
26480        let src = _mm_set_epi64x(3, 4);
26481        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26482        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
26483        let e = _mm_set_epi64x(3, 2);
26484        assert_eq_m128i(r, e);
26485    }
26486
26487    #[simd_test(enable = "avx512fp16,avx512vl")]
26488    unsafe fn test_mm_maskz_cvtph_epi64() {
26489        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26490        let r = _mm_maskz_cvtph_epi64(0b01, a);
26491        let e = _mm_set_epi64x(0, 2);
26492        assert_eq_m128i(r, e);
26493    }
26494
26495    #[simd_test(enable = "avx512fp16,avx512vl")]
26496    unsafe fn test_mm256_cvtph_epi64() {
26497        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26498        let r = _mm256_cvtph_epi64(a);
26499        let e = _mm256_set_epi64x(1, 2, 3, 4);
26500        assert_eq_m256i(r, e);
26501    }
26502
26503    #[simd_test(enable = "avx512fp16,avx512vl")]
26504    unsafe fn test_mm256_mask_cvtph_epi64() {
26505        let src = _mm256_set_epi64x(5, 6, 7, 8);
26506        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26507        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
26508        let e = _mm256_set_epi64x(5, 2, 7, 4);
26509        assert_eq_m256i(r, e);
26510    }
26511
26512    #[simd_test(enable = "avx512fp16,avx512vl")]
26513    unsafe fn test_mm256_maskz_cvtph_epi64() {
26514        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26515        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
26516        let e = _mm256_set_epi64x(0, 2, 0, 4);
26517        assert_eq_m256i(r, e);
26518    }
26519
26520    #[simd_test(enable = "avx512fp16")]
26521    unsafe fn test_mm512_cvtph_epi64() {
26522        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26523        let r = _mm512_cvtph_epi64(a);
26524        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26525        assert_eq_m512i(r, e);
26526    }
26527
26528    #[simd_test(enable = "avx512fp16")]
26529    unsafe fn test_mm512_mask_cvtph_epi64() {
26530        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26531        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26532        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
26533        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26534        assert_eq_m512i(r, e);
26535    }
26536
26537    #[simd_test(enable = "avx512fp16")]
26538    unsafe fn test_mm512_maskz_cvtph_epi64() {
26539        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26540        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
26541        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26542        assert_eq_m512i(r, e);
26543    }
26544
26545    #[simd_test(enable = "avx512fp16")]
26546    unsafe fn test_mm512_cvt_roundph_epi64() {
26547        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26548        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26549        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26550        assert_eq_m512i(r, e);
26551    }
26552
26553    #[simd_test(enable = "avx512fp16")]
26554    unsafe fn test_mm512_mask_cvt_roundph_epi64() {
26555        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26556        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26557        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26558            src, 0b01010101, a,
26559        );
26560        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26561        assert_eq_m512i(r, e);
26562    }
26563
26564    #[simd_test(enable = "avx512fp16")]
26565    unsafe fn test_mm512_maskz_cvt_roundph_epi64() {
26566        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26567        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26568            0b01010101, a,
26569        );
26570        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26571        assert_eq_m512i(r, e);
26572    }
26573
26574    #[simd_test(enable = "avx512fp16,avx512vl")]
26575    unsafe fn test_mm_cvtph_epu64() {
26576        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26577        let r = _mm_cvtph_epu64(a);
26578        let e = _mm_set_epi64x(1, 2);
26579        assert_eq_m128i(r, e);
26580    }
26581
26582    #[simd_test(enable = "avx512fp16,avx512vl")]
26583    unsafe fn test_mm_mask_cvtph_epu64() {
26584        let src = _mm_set_epi64x(3, 4);
26585        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26586        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
26587        let e = _mm_set_epi64x(3, 2);
26588        assert_eq_m128i(r, e);
26589    }
26590
26591    #[simd_test(enable = "avx512fp16,avx512vl")]
26592    unsafe fn test_mm_maskz_cvtph_epu64() {
26593        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26594        let r = _mm_maskz_cvtph_epu64(0b01, a);
26595        let e = _mm_set_epi64x(0, 2);
26596        assert_eq_m128i(r, e);
26597    }
26598
26599    #[simd_test(enable = "avx512fp16,avx512vl")]
26600    unsafe fn test_mm256_cvtph_epu64() {
26601        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26602        let r = _mm256_cvtph_epu64(a);
26603        let e = _mm256_set_epi64x(1, 2, 3, 4);
26604        assert_eq_m256i(r, e);
26605    }
26606
26607    #[simd_test(enable = "avx512fp16,avx512vl")]
26608    unsafe fn test_mm256_mask_cvtph_epu64() {
26609        let src = _mm256_set_epi64x(5, 6, 7, 8);
26610        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26611        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
26612        let e = _mm256_set_epi64x(5, 2, 7, 4);
26613        assert_eq_m256i(r, e);
26614    }
26615
26616    #[simd_test(enable = "avx512fp16,avx512vl")]
26617    unsafe fn test_mm256_maskz_cvtph_epu64() {
26618        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26619        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
26620        let e = _mm256_set_epi64x(0, 2, 0, 4);
26621        assert_eq_m256i(r, e);
26622    }
26623
26624    #[simd_test(enable = "avx512fp16")]
26625    unsafe fn test_mm512_cvtph_epu64() {
26626        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26627        let r = _mm512_cvtph_epu64(a);
26628        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26629        assert_eq_m512i(r, e);
26630    }
26631
26632    #[simd_test(enable = "avx512fp16")]
26633    unsafe fn test_mm512_mask_cvtph_epu64() {
26634        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26635        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26636        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
26637        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26638        assert_eq_m512i(r, e);
26639    }
26640
26641    #[simd_test(enable = "avx512fp16")]
26642    unsafe fn test_mm512_maskz_cvtph_epu64() {
26643        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26644        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
26645        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26646        assert_eq_m512i(r, e);
26647    }
26648
26649    #[simd_test(enable = "avx512fp16")]
26650    unsafe fn test_mm512_cvt_roundph_epu64() {
26651        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26652        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26653        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26654        assert_eq_m512i(r, e);
26655    }
26656
26657    #[simd_test(enable = "avx512fp16")]
26658    unsafe fn test_mm512_mask_cvt_roundph_epu64() {
26659        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26660        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26661        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26662            src, 0b01010101, a,
26663        );
26664        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26665        assert_eq_m512i(r, e);
26666    }
26667
26668    #[simd_test(enable = "avx512fp16")]
26669    unsafe fn test_mm512_maskz_cvt_roundph_epu64() {
26670        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26671        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26672            0b01010101, a,
26673        );
26674        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26675        assert_eq_m512i(r, e);
26676    }
26677
26678    #[simd_test(enable = "avx512fp16,avx512vl")]
26679    unsafe fn test_mm_cvttph_epi64() {
26680        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26681        let r = _mm_cvttph_epi64(a);
26682        let e = _mm_set_epi64x(1, 2);
26683        assert_eq_m128i(r, e);
26684    }
26685
26686    #[simd_test(enable = "avx512fp16,avx512vl")]
26687    unsafe fn test_mm_mask_cvttph_epi64() {
26688        let src = _mm_set_epi64x(3, 4);
26689        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26690        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
26691        let e = _mm_set_epi64x(3, 2);
26692        assert_eq_m128i(r, e);
26693    }
26694
26695    #[simd_test(enable = "avx512fp16,avx512vl")]
26696    unsafe fn test_mm_maskz_cvttph_epi64() {
26697        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26698        let r = _mm_maskz_cvttph_epi64(0b01, a);
26699        let e = _mm_set_epi64x(0, 2);
26700        assert_eq_m128i(r, e);
26701    }
26702
26703    #[simd_test(enable = "avx512fp16,avx512vl")]
26704    unsafe fn test_mm256_cvttph_epi64() {
26705        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26706        let r = _mm256_cvttph_epi64(a);
26707        let e = _mm256_set_epi64x(1, 2, 3, 4);
26708        assert_eq_m256i(r, e);
26709    }
26710
26711    #[simd_test(enable = "avx512fp16,avx512vl")]
26712    unsafe fn test_mm256_mask_cvttph_epi64() {
26713        let src = _mm256_set_epi64x(5, 6, 7, 8);
26714        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26715        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
26716        let e = _mm256_set_epi64x(5, 2, 7, 4);
26717        assert_eq_m256i(r, e);
26718    }
26719
26720    #[simd_test(enable = "avx512fp16,avx512vl")]
26721    unsafe fn test_mm256_maskz_cvttph_epi64() {
26722        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26723        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
26724        let e = _mm256_set_epi64x(0, 2, 0, 4);
26725        assert_eq_m256i(r, e);
26726    }
26727
26728    #[simd_test(enable = "avx512fp16")]
26729    unsafe fn test_mm512_cvttph_epi64() {
26730        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26731        let r = _mm512_cvttph_epi64(a);
26732        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26733        assert_eq_m512i(r, e);
26734    }
26735
26736    #[simd_test(enable = "avx512fp16")]
26737    unsafe fn test_mm512_mask_cvttph_epi64() {
26738        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26739        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26740        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
26741        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26742        assert_eq_m512i(r, e);
26743    }
26744
26745    #[simd_test(enable = "avx512fp16")]
26746    unsafe fn test_mm512_maskz_cvttph_epi64() {
26747        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26748        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
26749        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26750        assert_eq_m512i(r, e);
26751    }
26752
26753    #[simd_test(enable = "avx512fp16")]
26754    unsafe fn test_mm512_cvtt_roundph_epi64() {
26755        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26756        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
26757        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26758        assert_eq_m512i(r, e);
26759    }
26760
26761    #[simd_test(enable = "avx512fp16")]
26762    unsafe fn test_mm512_mask_cvtt_roundph_epi64() {
26763        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26764        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26765        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26766        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26767        assert_eq_m512i(r, e);
26768    }
26769
26770    #[simd_test(enable = "avx512fp16")]
26771    unsafe fn test_mm512_maskz_cvtt_roundph_epi64() {
26772        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26773        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26774        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26775        assert_eq_m512i(r, e);
26776    }
26777
26778    #[simd_test(enable = "avx512fp16,avx512vl")]
26779    unsafe fn test_mm_cvttph_epu64() {
26780        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26781        let r = _mm_cvttph_epu64(a);
26782        let e = _mm_set_epi64x(1, 2);
26783        assert_eq_m128i(r, e);
26784    }
26785
26786    #[simd_test(enable = "avx512fp16,avx512vl")]
26787    unsafe fn test_mm_mask_cvttph_epu64() {
26788        let src = _mm_set_epi64x(3, 4);
26789        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26790        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
26791        let e = _mm_set_epi64x(3, 2);
26792        assert_eq_m128i(r, e);
26793    }
26794
26795    #[simd_test(enable = "avx512fp16,avx512vl")]
26796    unsafe fn test_mm_maskz_cvttph_epu64() {
26797        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26798        let r = _mm_maskz_cvttph_epu64(0b01, a);
26799        let e = _mm_set_epi64x(0, 2);
26800        assert_eq_m128i(r, e);
26801    }
26802
26803    #[simd_test(enable = "avx512fp16,avx512vl")]
26804    unsafe fn test_mm256_cvttph_epu64() {
26805        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26806        let r = _mm256_cvttph_epu64(a);
26807        let e = _mm256_set_epi64x(1, 2, 3, 4);
26808        assert_eq_m256i(r, e);
26809    }
26810
26811    #[simd_test(enable = "avx512fp16,avx512vl")]
26812    unsafe fn test_mm256_mask_cvttph_epu64() {
26813        let src = _mm256_set_epi64x(5, 6, 7, 8);
26814        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26815        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
26816        let e = _mm256_set_epi64x(5, 2, 7, 4);
26817        assert_eq_m256i(r, e);
26818    }
26819
26820    #[simd_test(enable = "avx512fp16,avx512vl")]
26821    unsafe fn test_mm256_maskz_cvttph_epu64() {
26822        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26823        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
26824        let e = _mm256_set_epi64x(0, 2, 0, 4);
26825        assert_eq_m256i(r, e);
26826    }
26827
26828    #[simd_test(enable = "avx512fp16")]
26829    unsafe fn test_mm512_cvttph_epu64() {
26830        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26831        let r = _mm512_cvttph_epu64(a);
26832        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26833        assert_eq_m512i(r, e);
26834    }
26835
26836    #[simd_test(enable = "avx512fp16")]
26837    unsafe fn test_mm512_mask_cvttph_epu64() {
26838        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26839        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26840        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
26841        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26842        assert_eq_m512i(r, e);
26843    }
26844
26845    #[simd_test(enable = "avx512fp16")]
26846    unsafe fn test_mm512_maskz_cvttph_epu64() {
26847        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26848        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
26849        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26850        assert_eq_m512i(r, e);
26851    }
26852
26853    #[simd_test(enable = "avx512fp16")]
26854    unsafe fn test_mm512_cvtt_roundph_epu64() {
26855        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26856        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
26857        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26858        assert_eq_m512i(r, e);
26859    }
26860
26861    #[simd_test(enable = "avx512fp16")]
26862    unsafe fn test_mm512_mask_cvtt_roundph_epu64() {
26863        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26864        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26865        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26866        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26867        assert_eq_m512i(r, e);
26868    }
26869
26870    #[simd_test(enable = "avx512fp16")]
26871    unsafe fn test_mm512_maskz_cvtt_roundph_epu64() {
26872        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26873        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26874        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26875        assert_eq_m512i(r, e);
26876    }
26877
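    // `cvtxph_ps` widens each f16 lane to f32; lane order and mask handling are the
    // same as in the integer conversion tests, with the results compared as packed
    // floats.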
26878    #[simd_test(enable = "avx512fp16,avx512vl")]
26879    unsafe fn test_mm_cvtxph_ps() {
26880        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26881        let r = _mm_cvtxph_ps(a);
26882        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
26883        assert_eq_m128(r, e);
26884    }
26885
26886    #[simd_test(enable = "avx512fp16,avx512vl")]
26887    unsafe fn test_mm_mask_cvtxph_ps() {
26888        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
26889        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26890        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
26891        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
26892        assert_eq_m128(r, e);
26893    }
26894
26895    #[simd_test(enable = "avx512fp16,avx512vl")]
26896    unsafe fn test_mm_maskz_cvtxph_ps() {
26897        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26898        let r = _mm_maskz_cvtxph_ps(0b0101, a);
26899        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
26900        assert_eq_m128(r, e);
26901    }
26902
26903    #[simd_test(enable = "avx512fp16,avx512vl")]
26904    unsafe fn test_mm256_cvtxph_ps() {
26905        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26906        let r = _mm256_cvtxph_ps(a);
26907        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26908        assert_eq_m256(r, e);
26909    }
26910
26911    #[simd_test(enable = "avx512fp16,avx512vl")]
26912    unsafe fn test_mm256_mask_cvtxph_ps() {
26913        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
26914        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26915        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
26916        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
26917        assert_eq_m256(r, e);
26918    }
26919
26920    #[simd_test(enable = "avx512fp16,avx512vl")]
26921    unsafe fn test_mm256_maskz_cvtxph_ps() {
26922        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26923        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
26924        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
26925        assert_eq_m256(r, e);
26926    }
26927
26928    #[simd_test(enable = "avx512fp16")]
26929    unsafe fn test_mm512_cvtxph_ps() {
26930        let a = _mm256_set_ph(
26931            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26932        );
26933        let r = _mm512_cvtxph_ps(a);
26934        let e = _mm512_set_ps(
26935            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26936        );
26937        assert_eq_m512(r, e);
26938    }
26939
26940    #[simd_test(enable = "avx512fp16")]
26941    unsafe fn test_mm512_mask_cvtxph_ps() {
26942        let src = _mm512_set_ps(
26943            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
26944            24.0, 25.0,
26945        );
26946        let a = _mm256_set_ph(
26947            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26948        );
26949        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
26950        let e = _mm512_set_ps(
26951            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
26952            16.0,
26953        );
26954        assert_eq_m512(r, e);
26955    }
26956
26957    #[simd_test(enable = "avx512fp16")]
26958    unsafe fn test_mm512_maskz_cvtxph_ps() {
26959        let a = _mm256_set_ph(
26960            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26961        );
26962        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
26963        let e = _mm512_set_ps(
26964            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
26965        );
26966        assert_eq_m512(r, e);
26967    }
26968
26969    #[simd_test(enable = "avx512fp16")]
26970    unsafe fn test_mm512_cvtx_roundph_ps() {
26971        let a = _mm256_set_ph(
26972            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26973        );
26974        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
26975        let e = _mm512_set_ps(
26976            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26977        );
26978        assert_eq_m512(r, e);
26979    }
26980
26981    #[simd_test(enable = "avx512fp16")]
26982    unsafe fn test_mm512_mask_cvtx_roundph_ps() {
26983        let src = _mm512_set_ps(
26984            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
26985            24.0, 25.0,
26986        );
26987        let a = _mm256_set_ph(
26988            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26989        );
26990        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26991        let e = _mm512_set_ps(
26992            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
26993            16.0,
26994        );
26995        assert_eq_m512(r, e);
26996    }
26997
26998    #[simd_test(enable = "avx512fp16")]
26999    unsafe fn test_mm512_maskz_cvtx_roundph_ps() {
27000        let a = _mm256_set_ph(
27001            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27002        );
27003        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
27004        let e = _mm512_set_ps(
27005            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
27006        );
27007        assert_eq_m512(r, e);
27008    }
27009
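    // `_mm_cvtsh_ss` converts the lowest f16 lane of `b` into the lowest f32 lane of
    // the result and copies the upper three lanes from `a`; the masked forms are
    // exercised with both mask values 0 and 1.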
27010    #[simd_test(enable = "avx512fp16")]
27011    unsafe fn test_mm_cvtsh_ss() {
27012        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27013        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27014        let r = _mm_cvtsh_ss(a, b);
27015        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27016        assert_eq_m128(r, e);
27017    }
27018
27019    #[simd_test(enable = "avx512fp16")]
27020    unsafe fn test_mm_mask_cvtsh_ss() {
27021        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27022        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27023        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27024        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
27025        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27026        assert_eq_m128(r, e);
27027        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
27028        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27029        assert_eq_m128(r, e);
27030    }
27031
27032    #[simd_test(enable = "avx512fp16")]
27033    unsafe fn test_mm_maskz_cvtsh_ss() {
27034        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27035        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27036        let r = _mm_maskz_cvtsh_ss(0, a, b);
27037        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27038        assert_eq_m128(r, e);
27039        let r = _mm_maskz_cvtsh_ss(1, a, b);
27040        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27041        assert_eq_m128(r, e);
27042    }
27043
27044    #[simd_test(enable = "avx512fp16")]
27045    unsafe fn test_mm_cvt_roundsh_ss() {
27046        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27047        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27048        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
27049        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27050        assert_eq_m128(r, e);
27051    }
27052
27053    #[simd_test(enable = "avx512fp16")]
27054    unsafe fn test_mm_mask_cvt_roundsh_ss() {
27055        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27056        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27057        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27058        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27059        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27060        assert_eq_m128(r, e);
27061        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27062        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27063        assert_eq_m128(r, e);
27064    }
27065
27066    #[simd_test(enable = "avx512fp16")]
27067    unsafe fn test_mm_maskz_cvt_roundsh_ss() {
27068        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27069        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27070        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
27071        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27072        assert_eq_m128(r, e);
27073        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
27074        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27075        assert_eq_m128(r, e);
27076    }
27077
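    // f16 -> f64 conversions double the lane width twice over, so a 128-bit
    // destination holds only the lowest two source lanes, 256-bit the lowest four,
    // and 512-bit all eight.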
27078    #[simd_test(enable = "avx512fp16,avx512vl")]
27079    unsafe fn test_mm_cvtph_pd() {
27080        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27081        let r = _mm_cvtph_pd(a);
27082        let e = _mm_set_pd(1.0, 2.0);
27083        assert_eq_m128d(r, e);
27084    }
27085
27086    #[simd_test(enable = "avx512fp16,avx512vl")]
27087    unsafe fn test_mm_mask_cvtph_pd() {
27088        let src = _mm_set_pd(10.0, 11.0);
27089        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27090        let r = _mm_mask_cvtph_pd(src, 0b01, a);
27091        let e = _mm_set_pd(10.0, 2.0);
27092        assert_eq_m128d(r, e);
27093    }
27094
27095    #[simd_test(enable = "avx512fp16,avx512vl")]
27096    unsafe fn test_mm_maskz_cvtph_pd() {
27097        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27098        let r = _mm_maskz_cvtph_pd(0b01, a);
27099        let e = _mm_set_pd(0.0, 2.0);
27100        assert_eq_m128d(r, e);
27101    }
27102
27103    #[simd_test(enable = "avx512fp16,avx512vl")]
27104    unsafe fn test_mm256_cvtph_pd() {
27105        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27106        let r = _mm256_cvtph_pd(a);
27107        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
27108        assert_eq_m256d(r, e);
27109    }
27110
27111    #[simd_test(enable = "avx512fp16,avx512vl")]
27112    unsafe fn test_mm256_mask_cvtph_pd() {
27113        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
27114        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27115        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
27116        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
27117        assert_eq_m256d(r, e);
27118    }
27119
27120    #[simd_test(enable = "avx512fp16,avx512vl")]
27121    unsafe fn test_mm256_maskz_cvtph_pd() {
27122        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27123        let r = _mm256_maskz_cvtph_pd(0b0101, a);
27124        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
27125        assert_eq_m256d(r, e);
27126    }
27127
27128    #[simd_test(enable = "avx512fp16")]
27129    unsafe fn test_mm512_cvtph_pd() {
27130        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27131        let r = _mm512_cvtph_pd(a);
27132        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27133        assert_eq_m512d(r, e);
27134    }
27135
27136    #[simd_test(enable = "avx512fp16")]
27137    unsafe fn test_mm512_mask_cvtph_pd() {
27138        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27139        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27140        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
27141        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27142        assert_eq_m512d(r, e);
27143    }
27144
27145    #[simd_test(enable = "avx512fp16")]
27146    unsafe fn test_mm512_maskz_cvtph_pd() {
27147        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27148        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
27149        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27150        assert_eq_m512d(r, e);
27151    }
27152
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_pd() {
        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m512d(r, e);
    }

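    // Scalar sh -> sd conversions: the low f16 element of `b` is converted to f64 in the
    // low lane of the result, while the upper lane is copied from `a` (the low lane is
    // taken from `src` or zeroed when the mask bit is clear).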
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvtsh_sd(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvtsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvtsh_sd(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvtsh_sd(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvt_roundsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

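    // `_cvtsh_h` extracts the lowest f16 element of the vector.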
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_h() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm256_cvtsh_h() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtsh_h() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

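    // `_mm_cvtsi128_si16` copies the lowest 16-bit integer element out of the vector;
    // `_mm_cvtsi16_si128` does the reverse and zeroes the remaining elements.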
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi128_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtsi128_si16(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi16_si128() {
        let a = 1;
        let r = _mm_cvtsi16_si128(a);
        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
}