core/portable-simd/crates/core_simd/src/swizzle_dyn.rs

use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;

impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector:
    /// each output byte is `self[idxs[i]]` when `idxs[i] < N`, and 0 otherwise.
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(all(
            any(target_arch = "aarch64", target_arch = "arm64ec"),
            target_endian = "little"
        ))]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(
            target_arch = "arm",
            target_feature = "v7",
            target_feature = "neon",
            target_endian = "little"
        ))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "wasm64")]
        use core::arch::wasm64 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Each match arm below is compiled only when the target features
        // it relies on are enabled, so the intrinsics it calls are available.
        unsafe {
            match N {
                #[cfg(all(
                    any(
                        target_arch = "aarch64",
                        target_arch = "arm64ec",
                        all(target_arch = "arm", target_feature = "v7")
                    ),
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, zeroing_idxs(idxs)),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(
                    any(target_arch = "aarch64", target_arch = "arm64ec"),
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(
                    target_arch = "arm",
                    target_feature = "v7",
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize(avx2_pshufb, self, idxs),
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                32 => {
                    let swizzler = |bytes, idxs| {
                        // Unlike pshufb, vpermb does not zero a byte when its index's
                        // high bit is set, so zero-mask the out-of-range indices instead.
                        let mask = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
                            idxs,
                            Simd::<u8, 32>::splat(N as u8).into(),
                        );
                        x86::_mm256_maskz_permutexvar_epi8(mask, idxs, bytes)
                    };
                    transize(swizzler, self, idxs)
                }
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                64 => {
                    // Same approach as the 32-byte case, with 512-bit vectors.
                    let swizzler = |bytes, idxs| {
                        let mask = x86::_mm512_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
                            idxs,
                            Simd::<u8, 64>::splat(N as u8).into(),
                        );
                        x86::_mm512_maskz_permutexvar_epi8(mask, idxs, bytes)
                    };
                    transize(swizzler, self, idxs)
                }
                // Portable fallback: scalar gather, leaving out-of-range lanes as 0.
                _ => {
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        };
                    }
                    array.into()
                }
            }
        }
    }
}
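
// A minimal usage sketch (not part of the original file; the module and test
// names are hypothetical): exercises the documented `swizzle_dyn` semantics,
// which hold on every backend above as well as the portable fallback.
#[cfg(test)]
mod swizzle_dyn_usage_sketch {
    use super::*;

    #[test]
    fn gathers_in_range_and_zeroes_out_of_range() {
        let bytes = Simd::<u8, 8>::from_array([10, 20, 30, 40, 50, 60, 70, 80]);
        // Indices 200 and 255 are >= N, so those output bytes become 0.
        let idxs = Simd::<u8, 8>::from_array([3, 0, 200, 7, 1, 1, 255, 4]);
        assert_eq!(
            bytes.swizzle_dyn(idxs).to_array(),
            [40, 10, 0, 80, 20, 20, 0, 50]
        );
    }
}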

/// Emulates a 16-byte table lookup on ARMv7 NEON by splitting the table into
/// two `uint8x8_t` halves and using `vtbl2_u8`, which yields 0 for any index
/// outside the 16-byte table.
#[cfg(all(
    target_arch = "arm",
    target_feature = "v7",
    target_feature = "neon",
    target_endian = "little"
))]
unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
    // SAFETY: The `cfg` above guarantees NEON is available.
    unsafe {
        let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
        let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into()));
        let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into()));
        vcombine_u8(lo, hi).into()
    }
}

/// Emulates a full 32-byte dynamic byte swizzle on AVX2, where
/// `_mm256_shuffle_epi8` only shuffles within each 128-bit half;
/// indices outside `0..32` produce 0.
///
/// # Safety
/// The caller must ensure AVX2 is available.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
#[allow(clippy::let_and_return)]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::cmp::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2 is available.
    unsafe {
        // Broadcast the upper 16 bytes into both 128-bit halves, so a per-half
        // pshufb using only the low 4 bits of each index can still pick bytes 16..32.
        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(hihi, idxs.into()));
        // Zero every lane whose index is >= 32; lanes with indices < 16 are
        // overwritten by the low-half pass below.
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        // Broadcast the lower 16 bytes into both halves and take those lookups
        // wherever the index is < 16.
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
        compose
    }
}
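
// Illustrative scalar model (not part of the original file; the function names
// here are hypothetical) of the two-pass composition used by `avx2_pshufb`:
// `half_pshufb` mimics `_mm256_shuffle_epi8` (per 128-bit half, low 4 index
// bits, zero when the index's high bit is set), and the two broadcast passes
// are then combined exactly like the selects above.
#[allow(dead_code)]
fn avx2_pshufb_scalar_model(bytes: [u8; 32], idxs: [u8; 32]) -> [u8; 32] {
    fn half_pshufb(halves: [u8; 32], idxs: [u8; 32]) -> [u8; 32] {
        let mut out = [0u8; 32];
        for i in 0..32 {
            if idxs[i] < 0x80 {
                let half_base = (i / 16) * 16;
                out[i] = halves[half_base + (idxs[i] & 0x0f) as usize];
            }
        }
        out
    }
    // The high and low 16 bytes, each broadcast into both halves.
    let mut hihi = [0u8; 32];
    let mut lolo = [0u8; 32];
    hihi[..16].copy_from_slice(&bytes[16..]);
    hihi[16..].copy_from_slice(&bytes[16..]);
    lolo[..16].copy_from_slice(&bytes[..16]);
    lolo[16..].copy_from_slice(&bytes[..16]);
    let hi_shuf = half_pshufb(hihi, idxs);
    let lo_shuf = half_pshufb(lolo, idxs);
    let mut compose = [0u8; 32];
    for i in 0..32 {
        compose[i] = if idxs[i] < 16 {
            lo_shuf[i]
        } else if idxs[i] < 32 {
            hi_shuf[i]
        } else {
            0
        };
    }
    compose
}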

/// Calls an architecture-specific function whose argument and return type have
/// the same size as `Simd<u8, N>`, transmuting the inputs and the output.
///
/// # Safety
/// `T` must have the same size as `Simd<u8, N>`, and `f` must be sound to call
/// with the transmuted arguments.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    a: Simd<u8, N>,
    b: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: The caller guarantees the size match that `transmute_copy` requires.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&a), mem::transmute_copy(&b))) }
}
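
// Illustrative sketch (not part of the original file; `fake_intrinsic` and the
// module name are hypothetical): shows the shape of a `transize` call, where a
// function over some same-size type stands in for an architecture intrinsic.
#[cfg(test)]
mod transize_sketch {
    use super::*;

    // Stand-in for an intrinsic: `[u8; 16]` has the same size as `Simd<u8, 16>`.
    unsafe fn fake_intrinsic(a: [u8; 16], _b: [u8; 16]) -> [u8; 16] {
        a
    }

    #[test]
    fn round_trips_through_transize() {
        let a = Simd::<u8, 16>::from_array([7; 16]);
        let b = Simd::<u8, 16>::splat(0);
        // SAFETY: `[u8; 16]` and `Simd<u8, 16>` have the same size.
        let out = unsafe { transize(fake_intrinsic, a, b) };
        assert_eq!(out.to_array(), [7; 16]);
    }
}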

/// Sets out-of-range indices to `u8::MAX`, whose set high bit makes `pshufb`
/// write 0 for that byte instead of indexing modulo 16.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(unused)]
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    use crate::simd::cmp::SimdPartialOrd;
    idxs.simd_lt(Simd::splat(N as u8))
        .select(idxs, Simd::splat(u8::MAX))
}
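
// Illustrative sketch (not part of the original file; the module and test names
// are hypothetical): with N = 4, in-range indices pass through unchanged while
// out-of-range indices become 0xFF, which `pshufb` then turns into a 0 byte.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(test)]
mod zeroing_idxs_sketch {
    use super::*;

    #[test]
    fn maps_out_of_range_indices_to_0xff() {
        let idxs = Simd::<u8, 4>::from_array([0, 3, 4, 200]);
        assert_eq!(zeroing_idxs(idxs).to_array(), [0, 3, 255, 255]);
    }
}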