rustc_span/
analyze_source_file.rs

1use super::*;
2
3#[cfg(test)]
4mod tests;
5
6/// Finds all newlines, multi-byte characters, and non-narrow characters in a
7/// SourceFile.
8///
9/// This function will use an SSE2 enhanced implementation if hardware support
10/// is detected at runtime.
11pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
12    let mut lines = vec![RelativeBytePos::from_u32(0)];
13    let mut multi_byte_chars = vec![];
14
15    // Calls the right implementation, depending on hardware support available.
16    analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);
17
18    // The code above optimistically registers a new line *after* each \n
19    // it encounters. If that point is already outside the source_file, remove
20    // it again.
21    if let Some(&last_line_start) = lines.last() {
22        let source_file_end = RelativeBytePos::from_usize(src.len());
23        assert!(source_file_end >= last_line_start);
24        if last_line_start == source_file_end {
25            lines.pop();
26        }
27    }
28
29    (lines, multi_byte_chars)
30}
31
32#[cfg(bootstrap)]
33cfg_match! {
34    cfg(any(target_arch = "x86", target_arch = "x86_64")) => {
35        fn analyze_source_file_dispatch(
36            src: &str,
37            lines: &mut Vec<RelativeBytePos>,
38            multi_byte_chars: &mut Vec<MultiByteChar>,
39        ) {
40            if is_x86_feature_detected!("sse2") {
41                unsafe {
42                    analyze_source_file_sse2(src, lines, multi_byte_chars);
43                }
44            } else {
45                analyze_source_file_generic(
46                    src,
47                    src.len(),
48                    RelativeBytePos::from_u32(0),
49                    lines,
50                    multi_byte_chars,
51                );
52            }
53        }
54
55        /// Checks 16 byte chunks of text at a time. If the chunk contains
56        /// something other than printable ASCII characters and newlines, the
57        /// function falls back to the generic implementation. Otherwise it uses
58        /// SSE2 intrinsics to quickly find all newlines.
59        #[target_feature(enable = "sse2")]
60        unsafe fn analyze_source_file_sse2(
61            src: &str,
62            lines: &mut Vec<RelativeBytePos>,
63            multi_byte_chars: &mut Vec<MultiByteChar>,
64        ) {
65            #[cfg(target_arch = "x86")]
66            use std::arch::x86::*;
67            #[cfg(target_arch = "x86_64")]
68            use std::arch::x86_64::*;
69
70            const CHUNK_SIZE: usize = 16;
71
72            let src_bytes = src.as_bytes();
73
74            let chunk_count = src.len() / CHUNK_SIZE;
75
76            // This variable keeps track of where we should start decoding a
77            // chunk. If a multi-byte character spans across chunk boundaries,
78            // we need to skip that part in the next chunk because we already
79            // handled it.
80            let mut intra_chunk_offset = 0;
81
82            for chunk_index in 0..chunk_count {
83                let ptr = src_bytes.as_ptr() as *const __m128i;
84                // We don't know if the pointer is aligned to 16 bytes, so we
85                // use `loadu`, which supports unaligned loading.
86                let chunk = unsafe { _mm_loadu_si128(ptr.add(chunk_index)) };
87
88                // For character in the chunk, see if its byte value is < 0, which
89                // indicates that it's part of a UTF-8 char.
90                let multibyte_test = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)) };
91                // Create a bit mask from the comparison results.
92                let multibyte_mask = unsafe { _mm_movemask_epi8(multibyte_test) };
93
94                // If the bit mask is all zero, we only have ASCII chars here:
95                if multibyte_mask == 0 {
96                    assert!(intra_chunk_offset == 0);
97
98                    // Check for newlines in the chunk
99                    let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
100                    let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
101
102                    let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
103
104                    while newlines_mask != 0 {
105                        let index = newlines_mask.trailing_zeros();
106
107                        lines.push(RelativeBytePos(index) + output_offset);
108
109                        // Clear the bit, so we can find the next one.
110                        newlines_mask &= newlines_mask - 1;
111                    }
112                } else {
113                    // The slow path.
114                    // There are multibyte chars in here, fallback to generic decoding.
115                    let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
116                    intra_chunk_offset = analyze_source_file_generic(
117                        &src[scan_start..],
118                        CHUNK_SIZE - intra_chunk_offset,
119                        RelativeBytePos::from_usize(scan_start),
120                        lines,
121                        multi_byte_chars,
122                    );
123                }
124            }
125
126            // There might still be a tail left to analyze
127            let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
128            if tail_start < src.len() {
129                analyze_source_file_generic(
130                    &src[tail_start..],
131                    src.len() - tail_start,
132                    RelativeBytePos::from_usize(tail_start),
133                    lines,
134                    multi_byte_chars,
135                );
136            }
137        }
138    }
139    _ => {
140        // The target (or compiler version) does not support SSE2 ...
141        fn analyze_source_file_dispatch(
142            src: &str,
143            lines: &mut Vec<RelativeBytePos>,
144            multi_byte_chars: &mut Vec<MultiByteChar>,
145        ) {
146            analyze_source_file_generic(
147                src,
148                src.len(),
149                RelativeBytePos::from_u32(0),
150                lines,
151                multi_byte_chars,
152            );
153        }
154    }
155}
156
157#[cfg(not(bootstrap))]
158cfg_match! {
159    any(target_arch = "x86", target_arch = "x86_64") => {
160        fn analyze_source_file_dispatch(
161            src: &str,
162            lines: &mut Vec<RelativeBytePos>,
163            multi_byte_chars: &mut Vec<MultiByteChar>,
164        ) {
165            if is_x86_feature_detected!("sse2") {
166                unsafe {
167                    analyze_source_file_sse2(src, lines, multi_byte_chars);
168                }
169            } else {
170                analyze_source_file_generic(
171                    src,
172                    src.len(),
173                    RelativeBytePos::from_u32(0),
174                    lines,
175                    multi_byte_chars,
176                );
177            }
178        }
179
180        /// Checks 16 byte chunks of text at a time. If the chunk contains
181        /// something other than printable ASCII characters and newlines, the
182        /// function falls back to the generic implementation. Otherwise it uses
183        /// SSE2 intrinsics to quickly find all newlines.
184        #[target_feature(enable = "sse2")]
185        unsafe fn analyze_source_file_sse2(
186            src: &str,
187            lines: &mut Vec<RelativeBytePos>,
188            multi_byte_chars: &mut Vec<MultiByteChar>,
189        ) {
190            #[cfg(target_arch = "x86")]
191            use std::arch::x86::*;
192            #[cfg(target_arch = "x86_64")]
193            use std::arch::x86_64::*;
194
195            const CHUNK_SIZE: usize = 16;
196
197            let src_bytes = src.as_bytes();
198
199            let chunk_count = src.len() / CHUNK_SIZE;
200
201            // This variable keeps track of where we should start decoding a
202            // chunk. If a multi-byte character spans across chunk boundaries,
203            // we need to skip that part in the next chunk because we already
204            // handled it.
205            let mut intra_chunk_offset = 0;
206
207            for chunk_index in 0..chunk_count {
208                let ptr = src_bytes.as_ptr() as *const __m128i;
209                // We don't know if the pointer is aligned to 16 bytes, so we
210                // use `loadu`, which supports unaligned loading.
211                let chunk = unsafe { _mm_loadu_si128(ptr.add(chunk_index)) };
212
213                // For character in the chunk, see if its byte value is < 0, which
214                // indicates that it's part of a UTF-8 char.
215                let multibyte_test = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)) };
216                // Create a bit mask from the comparison results.
217                let multibyte_mask = unsafe { _mm_movemask_epi8(multibyte_test) };
218
219                // If the bit mask is all zero, we only have ASCII chars here:
220                if multibyte_mask == 0 {
221                    assert!(intra_chunk_offset == 0);
222
223                    // Check for newlines in the chunk
224                    let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
225                    let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
226
227                    let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
228
229                    while newlines_mask != 0 {
230                        let index = newlines_mask.trailing_zeros();
231
232                        lines.push(RelativeBytePos(index) + output_offset);
233
234                        // Clear the bit, so we can find the next one.
235                        newlines_mask &= newlines_mask - 1;
236                    }
237                } else {
238                    // The slow path.
239                    // There are multibyte chars in here, fallback to generic decoding.
240                    let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
241                    intra_chunk_offset = analyze_source_file_generic(
242                        &src[scan_start..],
243                        CHUNK_SIZE - intra_chunk_offset,
244                        RelativeBytePos::from_usize(scan_start),
245                        lines,
246                        multi_byte_chars,
247                    );
248                }
249            }
250
251            // There might still be a tail left to analyze
252            let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
253            if tail_start < src.len() {
254                analyze_source_file_generic(
255                    &src[tail_start..],
256                    src.len() - tail_start,
257                    RelativeBytePos::from_usize(tail_start),
258                    lines,
259                    multi_byte_chars,
260                );
261            }
262        }
263    }
264    _ => {
265        // The target (or compiler version) does not support SSE2 ...
266        fn analyze_source_file_dispatch(
267            src: &str,
268            lines: &mut Vec<RelativeBytePos>,
269            multi_byte_chars: &mut Vec<MultiByteChar>,
270        ) {
271            analyze_source_file_generic(
272                src,
273                src.len(),
274                RelativeBytePos::from_u32(0),
275                lines,
276                multi_byte_chars,
277            );
278        }
279    }
280}
281
282// `scan_len` determines the number of bytes in `src` to scan. Note that the
283// function can read past `scan_len` if a multi-byte character start within the
284// range but extends past it. The overflow is returned by the function.
285fn analyze_source_file_generic(
286    src: &str,
287    scan_len: usize,
288    output_offset: RelativeBytePos,
289    lines: &mut Vec<RelativeBytePos>,
290    multi_byte_chars: &mut Vec<MultiByteChar>,
291) -> usize {
292    assert!(src.len() >= scan_len);
293    let mut i = 0;
294    let src_bytes = src.as_bytes();
295
296    while i < scan_len {
297        let byte = unsafe {
298            // We verified that i < scan_len <= src.len()
299            *src_bytes.get_unchecked(i)
300        };
301
302        // How much to advance in order to get to the next UTF-8 char in the
303        // string.
304        let mut char_len = 1;
305
306        if byte == b'\n' {
307            let pos = RelativeBytePos::from_usize(i) + output_offset;
308            lines.push(pos + RelativeBytePos(1));
309        } else if byte >= 128 {
310            // This is the beginning of a multibyte char. Just decode to `char`.
311            let c = src[i..].chars().next().unwrap();
312            char_len = c.len_utf8();
313
314            let pos = RelativeBytePos::from_usize(i) + output_offset;
315            assert!((2..=4).contains(&char_len));
316            let mbc = MultiByteChar { pos, bytes: char_len as u8 };
317            multi_byte_chars.push(mbc);
318        }
319
320        i += char_len;
321    }
322
323    i - scan_len
324}