rustc_lint/
non_ascii_idents.rs

1use rustc_ast as ast;
2use rustc_data_structures::fx::FxIndexMap;
3use rustc_data_structures::unord::UnordMap;
4use rustc_session::{declare_lint, declare_lint_pass};
5use rustc_span::Symbol;
6use unicode_security::general_security_profile::IdentifierType;
7
8use crate::lints::{
9    ConfusableIdentifierPair, IdentifierNonAsciiChar, IdentifierUncommonCodepoints,
10    MixedScriptConfusables,
11};
12use crate::{EarlyContext, EarlyLintPass, LintContext};
13
14declare_lint! {
15    /// The `non_ascii_idents` lint detects non-ASCII identifiers.
16    ///
17    /// ### Example
18    ///
19    /// ```rust,compile_fail
20    /// # #![allow(unused)]
21    /// #![deny(non_ascii_idents)]
22    /// fn main() {
23    ///     let föö = 1;
24    /// }
25    /// ```
26    ///
27    /// {{produces}}
28    ///
29    /// ### Explanation
30    ///
31    /// This lint allows projects that wish to retain the limit of only using
32    /// ASCII characters to switch this lint to "forbid" (for example to ease
33    /// collaboration or for security reasons).
34    /// See [RFC 2457] for more details.
35    ///
36    /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
37    pub NON_ASCII_IDENTS,
38    Allow,
39    "detects non-ASCII identifiers",
40    crate_level_only
41}
42
43declare_lint! {
44    /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
45    /// identifiers.
46    ///
47    /// ### Example
48    ///
49    /// ```rust
50    /// # #![allow(unused)]
51    /// const µ: f64 = 0.000001;
52    /// ```
53    ///
54    /// {{produces}}
55    ///
56    /// ### Explanation
57    ///
58    /// This lint warns about using characters which are not commonly used, and may
59    /// cause visual confusion.
60    ///
61    /// This lint is triggered by identifiers that contain a codepoint that is
62    /// not part of the set of "Allowed" codepoints as described by [Unicode®
63    /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
64    /// Security Profile for Identifiers][TR39Allowed].
65    ///
66    /// Note that the set of uncommon codepoints may change over time. Beware
67    /// that if you "forbid" this lint that existing code may fail in the
68    /// future.
69    ///
70    /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
71    pub UNCOMMON_CODEPOINTS,
72    Warn,
73    "detects uncommon Unicode codepoints in identifiers",
74    crate_level_only
75}
76
77declare_lint! {
78    /// The `confusable_idents` lint detects visually confusable pairs between
79    /// identifiers.
80    ///
81    /// ### Example
82    ///
83    /// ```rust
84    /// // Latin Capital Letter E With Caron
85    /// pub const Ě: i32 = 1;
86    /// // Latin Capital Letter E With Breve
87    /// pub const Ĕ: i32 = 2;
88    /// ```
89    ///
90    /// {{produces}}
91    ///
92    /// ### Explanation
93    ///
94    /// This lint warns when different identifiers may appear visually similar,
95    /// which can cause confusion.
96    ///
97    /// The confusable detection algorithm is based on [Unicode® Technical
98    /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
99    /// Detection][TR39Confusable]. For every distinct identifier X execute
100    /// the function `skeleton(X)`. If there exist two distinct identifiers X
101    /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
102    /// The compiler uses the same mechanism to check if an identifier is too
103    /// similar to a keyword.
104    ///
105    /// Note that the set of confusable characters may change over time.
106    /// Beware that if you "forbid" this lint that existing code may fail in
107    /// the future.
108    ///
109    /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
110    pub CONFUSABLE_IDENTS,
111    Warn,
112    "detects visually confusable pairs between identifiers",
113    crate_level_only
114}
115
116declare_lint! {
117    /// The `mixed_script_confusables` lint detects visually confusable
118    /// characters in identifiers between different [scripts].
119    ///
120    /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
121    ///
122    /// ### Example
123    ///
124    /// ```rust
125    /// // The Japanese katakana character エ can be confused with the Han character 工.
126    /// const エ: &'static str = "アイウ";
127    /// ```
128    ///
129    /// {{produces}}
130    ///
131    /// ### Explanation
132    ///
133    /// This lint warns when characters between different scripts may appear
134    /// visually similar, which can cause confusion.
135    ///
136    /// If the crate contains other identifiers in the same script that have
137    /// non-confusable characters, then this lint will *not* be issued. For
138    /// example, if the example given above has another identifier with
139    /// katakana characters (such as `let カタカナ = 123;`), then this indicates
140    /// that you are intentionally using katakana, and it will not warn about
141    /// it.
142    ///
143    /// Note that the set of confusable characters may change over time.
144    /// Beware that if you "forbid" this lint that existing code may fail in
145    /// the future.
146    pub MIXED_SCRIPT_CONFUSABLES,
147    Warn,
148    "detects Unicode scripts whose mixed script confusables codepoints are solely used",
149    crate_level_only
150}
151
152declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
153
154impl EarlyLintPass for NonAsciiIdents {
155    fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
156        use std::collections::BTreeMap;
157
158        use rustc_session::lint::Level;
159        use rustc_span::Span;
160        use unicode_security::GeneralSecurityProfile;
161
162        let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
163        let check_uncommon_codepoints =
164            cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
165        let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
166        let check_mixed_script_confusables =
167            cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
168
169        if !check_non_ascii_idents
170            && !check_uncommon_codepoints
171            && !check_confusable_idents
172            && !check_mixed_script_confusables
173        {
174            return;
175        }
176
177        let mut has_non_ascii_idents = false;
178        let symbols = cx.sess().psess.symbol_gallery.symbols.lock();
179
180        // Sort by `Span` so that error messages make sense with respect to the
181        // order of identifier locations in the code.
182        // We will soon sort, so the initial order does not matter.
183        #[allow(rustc::potential_query_instability)]
184        let mut symbols: Vec<_> = symbols.iter().collect();
185        symbols.sort_by_key(|k| k.1);
186        for &(ref symbol, &sp) in symbols.iter() {
187            let symbol_str = symbol.as_str();
188            if symbol_str.is_ascii() {
189                continue;
190            }
191            has_non_ascii_idents = true;
192            cx.emit_span_lint(NON_ASCII_IDENTS, sp, IdentifierNonAsciiChar);
193            if check_uncommon_codepoints
194                && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
195            {
196                let mut chars: Vec<_> = symbol_str
197                    .chars()
198                    .map(|c| (c, GeneralSecurityProfile::identifier_type(c)))
199                    .collect();
200
201                for (id_ty, id_ty_descr) in [
202                    (IdentifierType::Exclusion, "Exclusion"),
203                    (IdentifierType::Technical, "Technical"),
204                    (IdentifierType::Limited_Use, "Limited_Use"),
205                    (IdentifierType::Not_NFKC, "Not_NFKC"),
206                ] {
207                    let codepoints: Vec<_> =
208                        chars.extract_if(.., |(_, ty)| *ty == Some(id_ty)).collect();
209                    if codepoints.is_empty() {
210                        continue;
211                    }
212                    cx.emit_span_lint(
213                        UNCOMMON_CODEPOINTS,
214                        sp,
215                        IdentifierUncommonCodepoints {
216                            codepoints_len: codepoints.len(),
217                            codepoints: codepoints.into_iter().map(|(c, _)| c).collect(),
218                            identifier_type: id_ty_descr,
219                        },
220                    );
221                }
222
223                let remaining = chars
224                    .extract_if(.., |(c, _)| !GeneralSecurityProfile::identifier_allowed(*c))
225                    .collect::<Vec<_>>();
226                if !remaining.is_empty() {
227                    cx.emit_span_lint(
228                        UNCOMMON_CODEPOINTS,
229                        sp,
230                        IdentifierUncommonCodepoints {
231                            codepoints_len: remaining.len(),
232                            codepoints: remaining.into_iter().map(|(c, _)| c).collect(),
233                            identifier_type: "Restricted",
234                        },
235                    );
236                }
237            }
238        }
239
240        if has_non_ascii_idents && check_confusable_idents {
241            let mut skeleton_map: UnordMap<Symbol, (Symbol, Span, bool)> =
242                UnordMap::with_capacity(symbols.len());
243            let mut skeleton_buf = String::new();
244
245            for &(&symbol, &sp) in symbols.iter() {
246                use unicode_security::confusable_detection::skeleton;
247
248                let symbol_str = symbol.as_str();
249                let is_ascii = symbol_str.is_ascii();
250
251                // Get the skeleton as a `Symbol`.
252                skeleton_buf.clear();
253                skeleton_buf.extend(skeleton(symbol_str));
254                let skeleton_sym = if *symbol_str == *skeleton_buf {
255                    symbol
256                } else {
257                    Symbol::intern(&skeleton_buf)
258                };
259
260                skeleton_map
261                    .entry(skeleton_sym)
262                    .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
263                        if !*existing_is_ascii || !is_ascii {
264                            cx.emit_span_lint(
265                                CONFUSABLE_IDENTS,
266                                sp,
267                                ConfusableIdentifierPair {
268                                    existing_sym: *existing_symbol,
269                                    sym: symbol,
270                                    label: *existing_span,
271                                    main_label: sp,
272                                },
273                            );
274                        }
275                        if *existing_is_ascii && !is_ascii {
276                            *existing_symbol = symbol;
277                            *existing_span = sp;
278                            *existing_is_ascii = is_ascii;
279                        }
280                    })
281                    .or_insert((symbol, sp, is_ascii));
282            }
283        }
284
285        if has_non_ascii_idents && check_mixed_script_confusables {
286            use unicode_security::is_potential_mixed_script_confusable_char;
287            use unicode_security::mixed_script::AugmentedScriptSet;
288
289            #[derive(Clone)]
290            enum ScriptSetUsage {
291                Suspicious(Vec<char>, Span),
292                Verified,
293            }
294
295            let mut script_states: FxIndexMap<AugmentedScriptSet, ScriptSetUsage> =
296                Default::default();
297            let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
298            script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
299
300            let mut has_suspicious = false;
301            for &(ref symbol, &sp) in symbols.iter() {
302                let symbol_str = symbol.as_str();
303                for ch in symbol_str.chars() {
304                    if ch.is_ascii() {
305                        // all ascii characters are covered by exception.
306                        continue;
307                    }
308                    if !GeneralSecurityProfile::identifier_allowed(ch) {
309                        // this character is covered by `uncommon_codepoints` lint.
310                        continue;
311                    }
312                    let augmented_script_set = AugmentedScriptSet::for_char(ch);
313                    script_states
314                        .entry(augmented_script_set)
315                        .and_modify(|existing_state| {
316                            if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
317                                if is_potential_mixed_script_confusable_char(ch) {
318                                    ch_list.push(ch);
319                                } else {
320                                    *existing_state = ScriptSetUsage::Verified;
321                                }
322                            }
323                        })
324                        .or_insert_with(|| {
325                            if !is_potential_mixed_script_confusable_char(ch) {
326                                ScriptSetUsage::Verified
327                            } else {
328                                has_suspicious = true;
329                                ScriptSetUsage::Suspicious(vec![ch], sp)
330                            }
331                        });
332                }
333            }
334
335            if has_suspicious {
336                // The end result is put in `lint_reports` which is sorted.
337                #[allow(rustc::potential_query_instability)]
338                let verified_augmented_script_sets = script_states
339                    .iter()
340                    .flat_map(|(k, v)| match v {
341                        ScriptSetUsage::Verified => Some(*k),
342                        _ => None,
343                    })
344                    .collect::<Vec<_>>();
345
346                // we're sorting the output here.
347                let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
348                    BTreeMap::new();
349
350                // The end result is put in `lint_reports` which is sorted.
351                #[allow(rustc::potential_query_instability)]
352                'outerloop: for (augment_script_set, usage) in script_states {
353                    let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
354
355                    if augment_script_set.is_all() {
356                        continue;
357                    }
358
359                    for existing in verified_augmented_script_sets.iter() {
360                        if existing.is_all() {
361                            continue;
362                        }
363                        let mut intersect = *existing;
364                        intersect.intersect_with(augment_script_set);
365                        if !intersect.is_empty() && !intersect.is_all() {
366                            continue 'outerloop;
367                        }
368                    }
369
370                    // We sort primitive chars here and can use unstable sort
371                    ch_list.sort_unstable();
372                    ch_list.dedup();
373                    lint_reports.insert((sp, ch_list), augment_script_set);
374                }
375
376                for ((sp, ch_list), script_set) in lint_reports {
377                    let mut includes = String::new();
378                    for (idx, ch) in ch_list.into_iter().enumerate() {
379                        if idx != 0 {
380                            includes += ", ";
381                        }
382                        let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
383                        includes += &char_info;
384                    }
385                    cx.emit_span_lint(
386                        MIXED_SCRIPT_CONFUSABLES,
387                        sp,
388                        MixedScriptConfusables { set: script_set.to_string(), includes },
389                    );
390                }
391            }
392        }
393    }
394}