core/stdarch/crates/core_arch/src/x86/fma.rs

//! Fused Multiply-Add instruction set (FMA)
//!
//! The FMA instruction set is an extension to the 128 and 256-bit SSE
//! instructions in the x86 microprocessor instruction set to perform fused
//! multiply–add (FMA) operations.
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the
//! instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
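//!
//! The example below is an illustrative sketch, not part of the referenced
//! manuals; it assumes runtime feature detection via std's
//! `is_x86_feature_detected!` macro and the `std::arch::x86_64` re-exports:
//!
//! ```ignore
//! #[cfg(target_arch = "x86_64")]
//! fn fma_demo() {
//!     use std::arch::x86_64::*;
//!     if is_x86_feature_detected!("fma") {
//!         // Safety: the `fma` target feature was detected at runtime.
//!         unsafe {
//!             let a = _mm_set1_ps(2.0);
//!             let b = _mm_set1_ps(3.0);
//!             let c = _mm_set1_ps(1.0);
//!             // Each lane computes 2.0 * 3.0 + 1.0 with a single rounding.
//!             let r = _mm_fmadd_ps(a, b, c);
//!             assert_eq!(_mm_cvtss_f32(r), 7.0);
//!         }
//!     }
//! }
//! ```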

use crate::core_arch::x86::*;
use crate::intrinsics::simd::{simd_fma, simd_neg};
use crate::intrinsics::{fmaf32, fmaf64};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and adds the intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    simd_fma(a, b, c)
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and adds the intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    simd_fma(a, b, c)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and adds the intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    simd_fma(a, b, c)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and adds the intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    simd_fma(a, b, c)
}

/// Multiplies the lower double-precision (64-bit) floating-point elements in
/// `a` and `b`, and adds the intermediate result to the lower element in `c`.
/// Stores the result in the lower element of the returned value, and copies
/// the upper element from `a` to the upper element of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd)
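///
/// An illustrative sketch (not from Intel's documentation): only the lowest
/// lane is computed; the upper lane is copied from `a`.
///
/// ```ignore
/// unsafe {
///     let a = _mm_setr_pd(2.0, 8.0);
///     let b = _mm_setr_pd(3.0, 100.0);
///     let c = _mm_setr_pd(1.0, 100.0);
///     // r = [2.0 * 3.0 + 1.0, 8.0] = [7.0, 8.0]
///     let r = _mm_fmadd_sd(a, b, c);
///     assert_eq!(_mm_cvtsd_f64(r), 7.0);
/// }
/// ```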
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    simd_insert!(
        a,
        0,
        fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), _mm_cvtsd_f64(c))
    )
}

/// Multiplies the lower single-precision (32-bit) floating-point elements in
/// `a` and `b`, and adds the intermediate result to the lower element in `c`.
/// Stores the result in the lower element of the returned value, and copies
/// the 3 upper elements from `a` to the upper elements of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
    simd_insert!(
        a,
        0,
        fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), _mm_cvtss_f32(c))
    )
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd)
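///
/// An illustrative sketch (not from Intel's documentation): the even lane
/// subtracts `c`, the odd lane adds `c`.
///
/// ```ignore
/// unsafe {
///     let a = _mm_setr_pd(1.0, 2.0);
///     let b = _mm_setr_pd(10.0, 10.0);
///     let c = _mm_setr_pd(3.0, 3.0);
///     // r = [1.0 * 10.0 - 3.0, 2.0 * 10.0 + 3.0] = [7.0, 23.0]
///     let r = _mm_fmaddsub_pd(a, b, c);
/// }
/// ```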
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    let add = simd_fma(a, b, c);
    let sub = simd_fma(a, b, simd_neg(c));
    // Even lanes take the subtracted result, odd lanes the added one.
    simd_shuffle!(add, sub, [2, 1])
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    let add = simd_fma(a, b, c);
    let sub = simd_fma(a, b, simd_neg(c));
    // Even lanes take the subtracted result, odd lanes the added one.
    simd_shuffle!(add, sub, [4, 1, 6, 3])
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    let add = simd_fma(a, b, c);
    let sub = simd_fma(a, b, simd_neg(c));
    // Even lanes take the subtracted result, odd lanes the added one.
    simd_shuffle!(add, sub, [4, 1, 6, 3])
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    let add = simd_fma(a, b, c);
    let sub = simd_fma(a, b, simd_neg(c));
    // Even lanes take the subtracted result, odd lanes the added one.
    simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    simd_fma(a, b, simd_neg(c))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    simd_fma(a, b, simd_neg(c))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub213ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    simd_fma(a, b, simd_neg(c))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub213ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    simd_fma(a, b, simd_neg(c))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements in
/// `a` and `b`, and subtracts the lower element in `c` from the intermediate
/// result. Stores the result in the lower element of the returned value, and
/// copies the upper element from `a` to the upper element of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    simd_insert!(
        a,
        0,
        fmaf64(_mm_cvtsd_f64(a), _mm_cvtsd_f64(b), -_mm_cvtsd_f64(c))
    )
}

/// Multiplies the lower single-precision (32-bit) floating-point elements in
/// `a` and `b`, and subtracts the lower element in `c` from the intermediate
/// result. Stores the result in the lower element of the returned value, and
/// copies the 3 upper elements from `a` to the upper elements of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
    simd_insert!(
        a,
        0,
        fmaf32(_mm_cvtss_f32(a), _mm_cvtss_f32(b), -_mm_cvtss_f32(c))
    )
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    let add = simd_fma(a, b, c);
    let sub = simd_fma(a, b, simd_neg(c));
    // Even lanes take the added result, odd lanes the subtracted one.
    simd_shuffle!(add, sub, [0, 3])
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    let add = simd_fma(a, b, c);
    let sub = simd_fma(a, b, simd_neg(c));
    // Even lanes take the added result, odd lanes the subtracted one.
    simd_shuffle!(add, sub, [0, 5, 2, 7])
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    let add = simd_fma(a, b, c);
    let sub = simd_fma(a, b, simd_neg(c));
    // Even lanes take the added result, odd lanes the subtracted one.
    simd_shuffle!(add, sub, [0, 5, 2, 7])
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
/// the intermediate result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    let add = simd_fma(a, b, c);
    let sub = simd_fma(a, b, simd_neg(c));
    // Even lanes take the added result, odd lanes the subtracted one.
    simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15])
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd)
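///
/// An illustrative sketch (not from Intel's documentation): each lane
/// computes `-(a * b) + c`.
///
/// ```ignore
/// unsafe {
///     let a = _mm_setr_pd(2.0, 4.0);
///     let b = _mm_setr_pd(3.0, 5.0);
///     let c = _mm_setr_pd(10.0, 30.0);
///     // r = [-(2.0 * 3.0) + 10.0, -(4.0 * 5.0) + 30.0] = [4.0, 10.0]
///     let r = _mm_fnmadd_pd(a, b, c);
/// }
/// ```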
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    simd_fma(simd_neg(a), b, c)
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    simd_fma(simd_neg(a), b, c)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    simd_fma(simd_neg(a), b, c)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    simd_fma(simd_neg(a), b, c)
}

/// Multiplies the lower double-precision (64-bit) floating-point elements in
/// `a` and `b`, and adds the negated intermediate result to the lower element
/// in `c`. Stores the result in the lower element of the returned value, and
/// copies the upper element from `a` to the upper element of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    simd_insert!(
        a,
        0,
        fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), _mm_cvtsd_f64(c))
    )
}

/// Multiplies the lower single-precision (32-bit) floating-point elements in
/// `a` and `b`, and adds the negated intermediate result to the lower element
/// in `c`. Stores the result in the lower element of the returned value, and
/// copies the 3 upper elements from `a` to the upper elements of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
    simd_insert!(
        a,
        0,
        fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), _mm_cvtss_f32(c))
    )
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    simd_fma(simd_neg(a), b, simd_neg(c))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    simd_fma(simd_neg(a), b, simd_neg(c))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
    simd_fma(simd_neg(a), b, simd_neg(c))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    simd_fma(simd_neg(a), b, simd_neg(c))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements in
/// `a` and `b`, and subtracts the lower element in `c` from the negated
/// intermediate result. Stores the result in the lower element of the returned
/// value, and copies the upper element from `a` to the upper element of the
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    simd_insert!(
        a,
        0,
        fmaf64(_mm_cvtsd_f64(a), -_mm_cvtsd_f64(b), -_mm_cvtsd_f64(c))
    )
}

/// Multiplies the lower single-precision (32-bit) floating-point elements in
/// `a` and `b`, and subtracts the lower element in `c` from the negated
/// intermediate result. Stores the result in the lower element of the
/// returned value, and copies the 3 upper elements from `a` to the upper
/// elements of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss)
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
    simd_insert!(
        a,
        0,
        fmaf32(_mm_cvtss_f32(a), -_mm_cvtss_f32(b), -_mm_cvtss_f32(c))
    )
}

#[cfg(test)]
mod tests {

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., 15.);
        assert_eq_m128d(_mm_fmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(9., 15., 22., 15.);
        assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., 15., 22., 15.);
        assert_eq_m128(_mm_fmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.);
        assert_eq_m256(_mm256_fmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., 2.);
        assert_eq_m128d(_mm_fmadd_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmadd_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., 2., 3., 4.);
        assert_eq_m128(_mm_fmadd_ss(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmaddsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., 15.);
        assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmaddsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(1., 15., 20., 15.);
        assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmaddsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., 15., 20., 15.);
        assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmaddsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.);
        assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., -3.);
        assert_eq_m128d(_mm_fmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(1., -3., 20., 1.);
        assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., -3., 20., 1.);
        assert_eq_m128(_mm_fmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.);
        assert_eq_m256(_mm256_fmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(1., 2.);
        assert_eq_m128d(_mm_fmsub_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsub_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(1., 2., 3., 4.);
        assert_eq_m128(_mm_fmsub_ss(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsubadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(9., -3.);
        assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsubadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(9., -3., 22., 1.);
        assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fmsubadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(9., -3., 22., 1.);
        assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fmsubadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.);
        assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-1., 3.);
        assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmadd_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(-1., 3., -20., -1.);
        assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-1., 3., -20., -1.);
        assert_eq_m128(_mm_fnmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmadd_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.);
        assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-1., 2.);
        assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmadd_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-1., 2., 3., 4.);
        assert_eq_m128(_mm_fnmadd_ss(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-9., -15.);
        assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmsub_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 3., 7., 2.);
        let c = _mm256_setr_pd(4., 9., 1., 7.);
        let r = _mm256_setr_pd(-9., -15., -22., -15.);
        assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_ps() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-9., -15., -22., -15.);
        assert_eq_m128(_mm_fnmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm256_fnmsub_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.);
        let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.);
        let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.);
        let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.);
        assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 3.);
        let c = _mm_setr_pd(4., 9.);
        let r = _mm_setr_pd(-9., 2.);
        assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r);
    }

    #[simd_test(enable = "fma")]
    unsafe fn test_mm_fnmsub_ss() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let b = _mm_setr_ps(5., 3., 7., 2.);
        let c = _mm_setr_ps(4., 9., 1., 7.);
        let r = _mm_setr_ps(-9., 2., 3., 4.);
        assert_eq_m128(_mm_fnmsub_ss(a, b, c), r);
    }
}