// rustc_lint/non_ascii_idents.rs

use rustc_ast as ast;
use rustc_data_structures::fx::FxIndexMap;
use rustc_data_structures::unord::UnordMap;
use rustc_session::{declare_lint, declare_lint_pass};
use rustc_span::Symbol;
use unicode_security::general_security_profile::IdentifierType;

use crate::lints::{
    ConfusableIdentifierPair, IdentifierNonAsciiChar, IdentifierUncommonCodepoints,
    MixedScriptConfusables,
};
use crate::{EarlyContext, EarlyLintPass, LintContext};

14declare_lint! {
15    /// The `non_ascii_idents` lint detects non-ASCII identifiers.
16    ///
17    /// ### Example
18    ///
19    /// ```rust,compile_fail
20    /// # #![allow(unused)]
21    /// #![deny(non_ascii_idents)]
22    /// fn main() {
23    ///     let föö = 1;
24    /// }
25    /// ```
26    ///
27    /// {{produces}}
28    ///
29    /// ### Explanation
30    ///
31    /// This lint allows projects that wish to retain the limit of only using
32    /// ASCII characters to switch this lint to "forbid" (for example to ease
33    /// collaboration or for security reasons).
34    /// See [RFC 2457] for more details.
35    ///
36    /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
37    pub NON_ASCII_IDENTS,
38    Allow,
39    "detects non-ASCII identifiers",
40    crate_level_only
41}
42
43declare_lint! {
44    /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
45    /// identifiers.
46    ///
47    /// ### Example
48    ///
49    /// ```rust
50    /// # #![allow(unused)]
51    /// const µ: f64 = 0.000001;
52    /// ```
53    ///
54    /// {{produces}}
55    ///
56    /// ### Explanation
57    ///
58    /// This lint warns about using characters which are not commonly used, and may
59    /// cause visual confusion.
60    ///
61    /// This lint is triggered by identifiers that contain a codepoint that is
62    /// not part of the set of "Allowed" codepoints as described by [Unicode®
63    /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
64    /// Security Profile for Identifiers][TR39Allowed].
65    ///
66    /// Note that the set of uncommon codepoints may change over time. Beware
67    /// that if you "forbid" this lint that existing code may fail in the
68    /// future.
69    ///
70    /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
71    pub UNCOMMON_CODEPOINTS,
72    Warn,
73    "detects uncommon Unicode codepoints in identifiers",
74    crate_level_only
75}
76
77declare_lint! {
78    /// The `confusable_idents` lint detects visually confusable pairs between
79    /// identifiers.
80    ///
81    /// ### Example
82    ///
83    /// ```rust
84    /// // Latin Capital Letter E With Caron
85    /// pub const Ě: i32 = 1;
86    /// // Latin Capital Letter E With Breve
87    /// pub const Ĕ: i32 = 2;
88    /// ```
89    ///
90    /// {{produces}}
91    ///
92    /// ### Explanation
93    ///
94    /// This lint warns when different identifiers may appear visually similar,
95    /// which can cause confusion.
96    ///
97    /// The confusable detection algorithm is based on [Unicode® Technical
98    /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
99    /// Detection][TR39Confusable]. For every distinct identifier X execute
100    /// the function `skeleton(X)`. If there exist two distinct identifiers X
101    /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
102    /// The compiler uses the same mechanism to check if an identifier is too
103    /// similar to a keyword.
104    ///
105    /// Note that the set of confusable characters may change over time.
106    /// Beware that if you "forbid" this lint that existing code may fail in
107    /// the future.
108    ///
109    /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
110    pub CONFUSABLE_IDENTS,
111    Warn,
112    "detects visually confusable pairs between identifiers",
113    crate_level_only
114}
115
116declare_lint! {
117    /// The `mixed_script_confusables` lint detects visually confusable
118    /// characters in identifiers between different [scripts].
119    ///
120    /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
121    ///
122    /// ### Example
123    ///
124    /// ```rust
125    /// // The Japanese katakana character エ can be confused with the Han character 工.
126    /// const エ: &'static str = "アイウ";
127    /// ```
128    ///
129    /// {{produces}}
130    ///
131    /// ### Explanation
132    ///
133    /// This lint warns when characters between different scripts may appear
134    /// visually similar, which can cause confusion.
135    ///
136    /// If the crate contains other identifiers in the same script that have
137    /// non-confusable characters, then this lint will *not* be issued. For
138    /// example, if the example given above has another identifier with
139    /// katakana characters (such as `let カタカナ = 123;`), then this indicates
140    /// that you are intentionally using katakana, and it will not warn about
141    /// it.
142    ///
143    /// Note that the set of confusable characters may change over time.
144    /// Beware that if you "forbid" this lint that existing code may fail in
145    /// the future.
146    pub MIXED_SCRIPT_CONFUSABLES,
147    Warn,
148    "detects Unicode scripts whose mixed script confusables codepoints are solely used",
149    crate_level_only
150}
151
152declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
153
154impl EarlyLintPass for NonAsciiIdents {
155    fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
156        use std::collections::BTreeMap;
157
158        use rustc_session::lint::Level;
159        use rustc_span::Span;
160        use unicode_security::GeneralSecurityProfile;
161
162        let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).level != Level::Allow;
163        let check_uncommon_codepoints =
164            cx.builder.lint_level(UNCOMMON_CODEPOINTS).level != Level::Allow;
165        let check_confusable_idents =
166            cx.builder.lint_level(CONFUSABLE_IDENTS).level != Level::Allow;
167        let check_mixed_script_confusables =
168            cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).level != Level::Allow;
169
170        if !check_non_ascii_idents
171            && !check_uncommon_codepoints
172            && !check_confusable_idents
173            && !check_mixed_script_confusables
174        {
175            return;
176        }
177
178        let mut has_non_ascii_idents = false;
179        let symbols = cx.sess().psess.symbol_gallery.symbols.lock();
180
181        // Sort by `Span` so that error messages make sense with respect to the
182        // order of identifier locations in the code.
183        // We will soon sort, so the initial order does not matter.
184        #[allow(rustc::potential_query_instability)]
185        let mut symbols: Vec<_> = symbols.iter().collect();
186        symbols.sort_by_key(|k| k.1);
187        for &(ref symbol, &sp) in symbols.iter() {
188            let symbol_str = symbol.as_str();
189            if symbol_str.is_ascii() {
190                continue;
191            }
192            has_non_ascii_idents = true;
193            cx.emit_span_lint(NON_ASCII_IDENTS, sp, IdentifierNonAsciiChar);
194            if check_uncommon_codepoints
195                && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
196            {
197                let mut chars: Vec<_> = symbol_str
198                    .chars()
199                    .map(|c| (c, GeneralSecurityProfile::identifier_type(c)))
200                    .collect();
201
202                for (id_ty, id_ty_descr) in [
203                    (IdentifierType::Exclusion, "Exclusion"),
204                    (IdentifierType::Technical, "Technical"),
205                    (IdentifierType::Limited_Use, "Limited_Use"),
206                    (IdentifierType::Not_NFKC, "Not_NFKC"),
207                ] {
208                    let codepoints: Vec<_> =
209                        chars.extract_if(.., |(_, ty)| *ty == Some(id_ty)).collect();
210                    if codepoints.is_empty() {
211                        continue;
212                    }
213                    cx.emit_span_lint(
214                        UNCOMMON_CODEPOINTS,
215                        sp,
216                        IdentifierUncommonCodepoints {
217                            codepoints_len: codepoints.len(),
218                            codepoints: codepoints.into_iter().map(|(c, _)| c).collect(),
219                            identifier_type: id_ty_descr,
220                        },
221                    );
222                }
223
224                let remaining = chars
225                    .extract_if(.., |(c, _)| !GeneralSecurityProfile::identifier_allowed(*c))
226                    .collect::<Vec<_>>();
227                if !remaining.is_empty() {
228                    cx.emit_span_lint(
229                        UNCOMMON_CODEPOINTS,
230                        sp,
231                        IdentifierUncommonCodepoints {
232                            codepoints_len: remaining.len(),
233                            codepoints: remaining.into_iter().map(|(c, _)| c).collect(),
234                            identifier_type: "Restricted",
235                        },
236                    );
237                }
238            }
239        }
240
241        if has_non_ascii_idents && check_confusable_idents {
242            let mut skeleton_map: UnordMap<Symbol, (Symbol, Span, bool)> =
243                UnordMap::with_capacity(symbols.len());
244            let mut skeleton_buf = String::new();
245
246            for &(&symbol, &sp) in symbols.iter() {
247                use unicode_security::confusable_detection::skeleton;
248
249                let symbol_str = symbol.as_str();
250                let is_ascii = symbol_str.is_ascii();
251
252                // Get the skeleton as a `Symbol`.
253                skeleton_buf.clear();
254                skeleton_buf.extend(skeleton(symbol_str));
255                let skeleton_sym = if *symbol_str == *skeleton_buf {
256                    symbol
257                } else {
258                    Symbol::intern(&skeleton_buf)
259                };
260
261                skeleton_map
262                    .entry(skeleton_sym)
263                    .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
264                        if !*existing_is_ascii || !is_ascii {
265                            cx.emit_span_lint(
266                                CONFUSABLE_IDENTS,
267                                sp,
268                                ConfusableIdentifierPair {
269                                    existing_sym: *existing_symbol,
270                                    sym: symbol,
271                                    label: *existing_span,
272                                    main_label: sp,
273                                },
274                            );
275                        }
276                        if *existing_is_ascii && !is_ascii {
277                            *existing_symbol = symbol;
278                            *existing_span = sp;
279                            *existing_is_ascii = is_ascii;
280                        }
281                    })
282                    .or_insert((symbol, sp, is_ascii));
283            }
284        }
285
286        if has_non_ascii_idents && check_mixed_script_confusables {
287            use unicode_security::is_potential_mixed_script_confusable_char;
288            use unicode_security::mixed_script::AugmentedScriptSet;
289
290            #[derive(Clone)]
291            enum ScriptSetUsage {
292                Suspicious(Vec<char>, Span),
293                Verified,
294            }
295
296            let mut script_states: FxIndexMap<AugmentedScriptSet, ScriptSetUsage> =
297                Default::default();
298            let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
299            script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
300
301            let mut has_suspicious = false;
302            for &(ref symbol, &sp) in symbols.iter() {
303                let symbol_str = symbol.as_str();
304                for ch in symbol_str.chars() {
305                    if ch.is_ascii() {
306                        // all ascii characters are covered by exception.
307                        continue;
308                    }
309                    if !GeneralSecurityProfile::identifier_allowed(ch) {
310                        // this character is covered by `uncommon_codepoints` lint.
311                        continue;
312                    }
313                    let augmented_script_set = AugmentedScriptSet::for_char(ch);
314                    script_states
315                        .entry(augmented_script_set)
316                        .and_modify(|existing_state| {
317                            if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
318                                if is_potential_mixed_script_confusable_char(ch) {
319                                    ch_list.push(ch);
320                                } else {
321                                    *existing_state = ScriptSetUsage::Verified;
322                                }
323                            }
324                        })
325                        .or_insert_with(|| {
326                            if !is_potential_mixed_script_confusable_char(ch) {
327                                ScriptSetUsage::Verified
328                            } else {
329                                has_suspicious = true;
330                                ScriptSetUsage::Suspicious(vec![ch], sp)
331                            }
332                        });
333                }
334            }
335
336            if has_suspicious {
337                // The end result is put in `lint_reports` which is sorted.
338                #[allow(rustc::potential_query_instability)]
339                let verified_augmented_script_sets = script_states
340                    .iter()
341                    .flat_map(|(k, v)| match v {
342                        ScriptSetUsage::Verified => Some(*k),
343                        _ => None,
344                    })
345                    .collect::<Vec<_>>();
346
347                // we're sorting the output here.
348                let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
349                    BTreeMap::new();
350
351                // The end result is put in `lint_reports` which is sorted.
352                #[allow(rustc::potential_query_instability)]
353                'outerloop: for (augment_script_set, usage) in script_states {
354                    let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
355
356                    if augment_script_set.is_all() {
357                        continue;
358                    }
359
360                    for existing in verified_augmented_script_sets.iter() {
361                        if existing.is_all() {
362                            continue;
363                        }
364                        let mut intersect = *existing;
365                        intersect.intersect_with(augment_script_set);
366                        if !intersect.is_empty() && !intersect.is_all() {
367                            continue 'outerloop;
368                        }
369                    }
370
371                    // We sort primitive chars here and can use unstable sort
372                    ch_list.sort_unstable();
373                    ch_list.dedup();
374                    lint_reports.insert((sp, ch_list), augment_script_set);
375                }
376
377                for ((sp, ch_list), script_set) in lint_reports {
378                    let mut includes = String::new();
379                    for (idx, ch) in ch_list.into_iter().enumerate() {
380                        if idx != 0 {
381                            includes += ", ";
382                        }
383                        let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
384                        includes += &char_info;
385                    }
386                    cx.emit_span_lint(
387                        MIXED_SCRIPT_CONFUSABLES,
388                        sp,
389                        MixedScriptConfusables { set: script_set.to_string(), includes },
390                    );
391                }
392            }
393        }
394    }
395}