rustc_span/
analyze_source_file.rs

1use super::*;
2
3#[cfg(test)]
4mod tests;
5
6/// Finds all newlines, multi-byte characters, and non-narrow characters in a
7/// SourceFile.
8///
9/// This function will use an SSE2 enhanced implementation if hardware support
10/// is detected at runtime.
11pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
12    let mut lines = vec![RelativeBytePos::from_u32(0)];
13    let mut multi_byte_chars = vec![];
14
15    // Calls the right implementation, depending on hardware support available.
16    analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);
17
18    // The code above optimistically registers a new line *after* each \n
19    // it encounters. If that point is already outside the source_file, remove
20    // it again.
21    if let Some(&last_line_start) = lines.last() {
22        let source_file_end = RelativeBytePos::from_usize(src.len());
23        assert!(source_file_end >= last_line_start);
24        if last_line_start == source_file_end {
25            lines.pop();
26        }
27    }
28
29    (lines, multi_byte_chars)
30}
31
32cfg_select! {
33    any(target_arch = "x86", target_arch = "x86_64") => {
34        fn analyze_source_file_dispatch(
35            src: &str,
36            lines: &mut Vec<RelativeBytePos>,
37            multi_byte_chars: &mut Vec<MultiByteChar>,
38        ) {
39            if is_x86_feature_detected!("sse2") {
40                unsafe {
41                    analyze_source_file_sse2(src, lines, multi_byte_chars);
42                }
43            } else {
44                analyze_source_file_generic(
45                    src,
46                    src.len(),
47                    RelativeBytePos::from_u32(0),
48                    lines,
49                    multi_byte_chars,
50                );
51            }
52        }
53
54        /// Checks 16 byte chunks of text at a time. If the chunk contains
55        /// something other than printable ASCII characters and newlines, the
56        /// function falls back to the generic implementation. Otherwise it uses
57        /// SSE2 intrinsics to quickly find all newlines.
58        #[target_feature(enable = "sse2")]
59        unsafe fn analyze_source_file_sse2(
60            src: &str,
61            lines: &mut Vec<RelativeBytePos>,
62            multi_byte_chars: &mut Vec<MultiByteChar>,
63        ) {
64            #[cfg(target_arch = "x86")]
65            use std::arch::x86::*;
66            #[cfg(target_arch = "x86_64")]
67            use std::arch::x86_64::*;
68
69            const CHUNK_SIZE: usize = 16;
70
71            let (chunks, tail) = src.as_bytes().as_chunks::<CHUNK_SIZE>();
72
73            // This variable keeps track of where we should start decoding a
74            // chunk. If a multi-byte character spans across chunk boundaries,
75            // we need to skip that part in the next chunk because we already
76            // handled it.
77            let mut intra_chunk_offset = 0;
78
79            for (chunk_index, chunk) in chunks.iter().enumerate() {
80                // We don't know if the pointer is aligned to 16 bytes, so we
81                // use `loadu`, which supports unaligned loading.
82                let chunk = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const __m128i) };
83
84                // For each character in the chunk, see if its byte value is < 0,
85                // which indicates that it's part of a UTF-8 char.
86                let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
87                // Create a bit mask from the comparison results.
88                let multibyte_mask = _mm_movemask_epi8(multibyte_test);
89
90                // If the bit mask is all zero, we only have ASCII chars here:
91                if multibyte_mask == 0 {
92                    assert!(intra_chunk_offset == 0);
93
94                    // Check for newlines in the chunk
95                    let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
96                    let mut newlines_mask = _mm_movemask_epi8(newlines_test);
97
98                    let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
99
100                    while newlines_mask != 0 {
101                        let index = newlines_mask.trailing_zeros();
102
103                        lines.push(RelativeBytePos(index) + output_offset);
104
105                        // Clear the bit, so we can find the next one.
106                        newlines_mask &= newlines_mask - 1;
107                    }
108                } else {
109                    // The slow path.
110                    // There are multibyte chars in here, fallback to generic decoding.
111                    let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
112                    intra_chunk_offset = analyze_source_file_generic(
113                        &src[scan_start..],
114                        CHUNK_SIZE - intra_chunk_offset,
115                        RelativeBytePos::from_usize(scan_start),
116                        lines,
117                        multi_byte_chars,
118                    );
119                }
120            }
121
122            // There might still be a tail left to analyze
123            let tail_start = src.len() - tail.len() + intra_chunk_offset;
124            if tail_start < src.len() {
125                analyze_source_file_generic(
126                    &src[tail_start..],
127                    src.len() - tail_start,
128                    RelativeBytePos::from_usize(tail_start),
129                    lines,
130                    multi_byte_chars,
131                );
132            }
133        }
134    }
135    target_arch = "loongarch64" => {
136        fn analyze_source_file_dispatch(
137            src: &str,
138            lines: &mut Vec<RelativeBytePos>,
139            multi_byte_chars: &mut Vec<MultiByteChar>,
140        ) {
141            use std::arch::is_loongarch_feature_detected;
142
143            if is_loongarch_feature_detected!("lsx") {
144                unsafe {
145                    analyze_source_file_lsx(src, lines, multi_byte_chars);
146                }
147            } else {
148                analyze_source_file_generic(
149                    src,
150                    src.len(),
151                    RelativeBytePos::from_u32(0),
152                    lines,
153                    multi_byte_chars,
154                );
155            }
156        }
157
158        /// Checks 16 byte chunks of text at a time. If the chunk contains
159        /// something other than printable ASCII characters and newlines, the
160        /// function falls back to the generic implementation. Otherwise it uses
161        /// LSX intrinsics to quickly find all newlines.
162        #[target_feature(enable = "lsx")]
163        unsafe fn analyze_source_file_lsx(
164            src: &str,
165            lines: &mut Vec<RelativeBytePos>,
166            multi_byte_chars: &mut Vec<MultiByteChar>,
167        ) {
168            use std::arch::loongarch64::*;
169
170            const CHUNK_SIZE: usize = 16;
171
172            let (chunks, tail) = src.as_bytes().as_chunks::<CHUNK_SIZE>();
173
174            // This variable keeps track of where we should start decoding a
175            // chunk. If a multi-byte character spans across chunk boundaries,
176            // we need to skip that part in the next chunk because we already
177            // handled it.
178            let mut intra_chunk_offset = 0;
179
180            for (chunk_index, chunk) in chunks.iter().enumerate() {
181                // All LSX memory instructions support unaligned access, so using
182                // vld is fine.
183                let chunk = unsafe { lsx_vld::<0>(chunk.as_ptr() as *const i8) };
184
185                // For each character in the chunk, see if its byte value is < 0,
186                // which indicates that it's part of a UTF-8 char.
187                let multibyte_mask = lsx_vmskltz_b(chunk);
188                // Create a bit mask from the comparison results.
189                let multibyte_mask = lsx_vpickve2gr_w::<0>(multibyte_mask);
190
191                // If the bit mask is all zero, we only have ASCII chars here:
192                if multibyte_mask == 0 {
193                    assert!(intra_chunk_offset == 0);
194
195                    // Check for newlines in the chunk
196                    let newlines_test = lsx_vseqi_b::<{b'\n' as i32}>(chunk);
197                    let newlines_mask = lsx_vmskltz_b(newlines_test);
198                    let mut newlines_mask = lsx_vpickve2gr_w::<0>(newlines_mask);
199
200                    let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
201
202                    while newlines_mask != 0 {
203                        let index = newlines_mask.trailing_zeros();
204
205                        lines.push(RelativeBytePos(index) + output_offset);
206
207                        // Clear the bit, so we can find the next one.
208                        newlines_mask &= newlines_mask - 1;
209                    }
210                } else {
211                    // The slow path.
212                    // There are multibyte chars in here, fallback to generic decoding.
213                    let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
214                    intra_chunk_offset = analyze_source_file_generic(
215                        &src[scan_start..],
216                        CHUNK_SIZE - intra_chunk_offset,
217                        RelativeBytePos::from_usize(scan_start),
218                        lines,
219                        multi_byte_chars,
220                    );
221                }
222            }
223
224            // There might still be a tail left to analyze
225            let tail_start = src.len() - tail.len() + intra_chunk_offset;
226            if tail_start < src.len() {
227                analyze_source_file_generic(
228                    &src[tail_start..],
229                    src.len() - tail_start,
230                    RelativeBytePos::from_usize(tail_start),
231                    lines,
232                    multi_byte_chars,
233                );
234            }
235        }
236    }
237    _ => {
238        // The target (or compiler version) does not support vector instructions
239        // our specialized implementations need (x86 SSE2, loongarch64 LSX)...
240        fn analyze_source_file_dispatch(
241            src: &str,
242            lines: &mut Vec<RelativeBytePos>,
243            multi_byte_chars: &mut Vec<MultiByteChar>,
244        ) {
245            analyze_source_file_generic(
246                src,
247                src.len(),
248                RelativeBytePos::from_u32(0),
249                lines,
250                multi_byte_chars,
251            );
252        }
253    }
254}
255
256// `scan_len` determines the number of bytes in `src` to scan. Note that the
257// function can read past `scan_len` if a multi-byte character start within the
258// range but extends past it. The overflow is returned by the function.
259fn analyze_source_file_generic(
260    src: &str,
261    scan_len: usize,
262    output_offset: RelativeBytePos,
263    lines: &mut Vec<RelativeBytePos>,
264    multi_byte_chars: &mut Vec<MultiByteChar>,
265) -> usize {
266    assert!(src.len() >= scan_len);
267    let mut i = 0;
268    let src_bytes = src.as_bytes();
269
270    while i < scan_len {
271        let byte = unsafe {
272            // We verified that i < scan_len <= src.len()
273            *src_bytes.get_unchecked(i)
274        };
275
276        // How much to advance in order to get to the next UTF-8 char in the
277        // string.
278        let mut char_len = 1;
279
280        if byte == b'\n' {
281            let pos = RelativeBytePos::from_usize(i) + output_offset;
282            lines.push(pos + RelativeBytePos(1));
283        } else if byte >= 128 {
284            // This is the beginning of a multibyte char. Just decode to `char`.
285            let c = src[i..].chars().next().unwrap();
286            char_len = c.len_utf8();
287
288            let pos = RelativeBytePos::from_usize(i) + output_offset;
289            assert!((2..=4).contains(&char_len));
290            let mbc = MultiByteChar { pos, bytes: char_len as u8 };
291            multi_byte_chars.push(mbc);
292        }
293
294        i += char_len;
295    }
296
297    i - scan_len
298}