// rustc_lint/non_ascii_idents.rs

1use rustc_ast as ast;
2use rustc_data_structures::fx::FxIndexMap;
3use rustc_data_structures::unord::UnordMap;
4use rustc_session::{declare_lint, declare_lint_pass};
5use rustc_span::Symbol;
6use unicode_security::general_security_profile::IdentifierType;
7
8use crate::lints::{
9    ConfusableIdentifierPair, IdentifierNonAsciiChar, IdentifierUncommonCodepoints,
10    MixedScriptConfusables,
11};
12use crate::{EarlyContext, EarlyLintPass, LintContext};
13
14#[doc = r" The `non_ascii_idents` lint detects non-ASCII identifiers."]
#[doc = r""]
#[doc = r" ### Example"]
#[doc = r""]
#[doc = r" ```rust,compile_fail"]
#[doc = r" # #![allow(unused)]"]
#[doc = r" #![deny(non_ascii_idents)]"]
#[doc = r" fn main() {"]
#[doc = r"     let föö = 1;"]
#[doc = r" }"]
#[doc = r" ```"]
#[doc = r""]
#[doc = r" {{produces}}"]
#[doc = r""]
#[doc = r" ### Explanation"]
#[doc = r""]
#[doc =
r" This lint allows projects that wish to retain the limit of only using"]
#[doc =
r#" ASCII characters to switch this lint to "forbid" (for example to ease"#]
#[doc = r" collaboration or for security reasons)."]
#[doc = r" See [RFC 2457] for more details."]
#[doc = r""]
#[doc =
r" [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md"]
pub static NON_ASCII_IDENTS: &::rustc_lint_defs::Lint =
    &::rustc_lint_defs::Lint {
            name: "NON_ASCII_IDENTS",
            default_level: ::rustc_lint_defs::Allow,
            desc: "detects non-ASCII identifiers",
            is_externally_loaded: false,
            crate_level_only: true,
            ..::rustc_lint_defs::Lint::default_fields_for_macro()
        };declare_lint! {
15    /// The `non_ascii_idents` lint detects non-ASCII identifiers.
16    ///
17    /// ### Example
18    ///
19    /// ```rust,compile_fail
20    /// # #![allow(unused)]
21    /// #![deny(non_ascii_idents)]
22    /// fn main() {
23    ///     let föö = 1;
24    /// }
25    /// ```
26    ///
27    /// {{produces}}
28    ///
29    /// ### Explanation
30    ///
31    /// This lint allows projects that wish to retain the limit of only using
32    /// ASCII characters to switch this lint to "forbid" (for example to ease
33    /// collaboration or for security reasons).
34    /// See [RFC 2457] for more details.
35    ///
36    /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
37    pub NON_ASCII_IDENTS,
38    Allow,
39    "detects non-ASCII identifiers",
40    crate_level_only
41}
42
43#[doc =
r" The `uncommon_codepoints` lint detects uncommon Unicode codepoints in"]
#[doc = r" identifiers."]
#[doc = r""]
#[doc = r" ### Example"]
#[doc = r""]
#[doc = r" ```rust"]
#[doc = r" # #![allow(unused)]"]
#[doc = r" const µ: f64 = 0.000001;"]
#[doc = r" ```"]
#[doc = r""]
#[doc = r" {{produces}}"]
#[doc = r""]
#[doc = r" ### Explanation"]
#[doc = r""]
#[doc =
r" This lint warns about using characters which are not commonly used, and may"]
#[doc = r" cause visual confusion."]
#[doc = r""]
#[doc =
r" This lint is triggered by identifiers that contain a codepoint that is"]
#[doc =
r#" not part of the set of "Allowed" codepoints as described by [Unicode®"#]
#[doc =
r" Technical Standard #39 Unicode Security Mechanisms Section 3.1 General"]
#[doc = r" Security Profile for Identifiers][TR39Allowed]."]
#[doc = r""]
#[doc =
r" Note that the set of uncommon codepoints may change over time. Beware"]
#[doc =
r#" that if you "forbid" this lint that existing code may fail in the"#]
#[doc = r" future."]
#[doc = r""]
#[doc =
r" [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile"]
pub static UNCOMMON_CODEPOINTS: &::rustc_lint_defs::Lint =
    &::rustc_lint_defs::Lint {
            name: "UNCOMMON_CODEPOINTS",
            default_level: ::rustc_lint_defs::Warn,
            desc: "detects uncommon Unicode codepoints in identifiers",
            is_externally_loaded: false,
            crate_level_only: true,
            ..::rustc_lint_defs::Lint::default_fields_for_macro()
        };declare_lint! {
44    /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
45    /// identifiers.
46    ///
47    /// ### Example
48    ///
49    /// ```rust
50    /// # #![allow(unused)]
51    /// const µ: f64 = 0.000001;
52    /// ```
53    ///
54    /// {{produces}}
55    ///
56    /// ### Explanation
57    ///
58    /// This lint warns about using characters which are not commonly used, and may
59    /// cause visual confusion.
60    ///
61    /// This lint is triggered by identifiers that contain a codepoint that is
62    /// not part of the set of "Allowed" codepoints as described by [Unicode®
63    /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
64    /// Security Profile for Identifiers][TR39Allowed].
65    ///
66    /// Note that the set of uncommon codepoints may change over time. Beware
67    /// that if you "forbid" this lint that existing code may fail in the
68    /// future.
69    ///
70    /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
71    pub UNCOMMON_CODEPOINTS,
72    Warn,
73    "detects uncommon Unicode codepoints in identifiers",
74    crate_level_only
75}
76
77#[doc =
r" The `confusable_idents` lint detects visually confusable pairs between"]
#[doc = r" identifiers."]
#[doc = r""]
#[doc = r" ### Example"]
#[doc = r""]
#[doc = r" ```rust"]
#[doc = r" // Latin Capital Letter E With Caron"]
#[doc = r" pub const Ě: i32 = 1;"]
#[doc = r" // Latin Capital Letter E With Breve"]
#[doc = r" pub const Ĕ: i32 = 2;"]
#[doc = r" ```"]
#[doc = r""]
#[doc = r" {{produces}}"]
#[doc = r""]
#[doc = r" ### Explanation"]
#[doc = r""]
#[doc =
r" This lint warns when different identifiers may appear visually similar,"]
#[doc = r" which can cause confusion."]
#[doc = r""]
#[doc =
r" The confusable detection algorithm is based on [Unicode® Technical"]
#[doc = r" Standard #39 Unicode Security Mechanisms Section 4 Confusable"]
#[doc =
r" Detection][TR39Confusable]. For every distinct identifier X execute"]
#[doc =
r" the function `skeleton(X)`. If there exist two distinct identifiers X"]
#[doc =
r" and Y in the same crate where `skeleton(X) = skeleton(Y)` report it."]
#[doc =
r" The compiler uses the same mechanism to check if an identifier is too"]
#[doc = r" similar to a keyword."]
#[doc = r""]
#[doc = r" Note that the set of confusable characters may change over time."]
#[doc =
r#" Beware that if you "forbid" this lint that existing code may fail in"#]
#[doc = r" the future."]
#[doc = r""]
#[doc =
r" [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection"]
pub static CONFUSABLE_IDENTS: &::rustc_lint_defs::Lint =
    &::rustc_lint_defs::Lint {
            name: "CONFUSABLE_IDENTS",
            default_level: ::rustc_lint_defs::Warn,
            desc: "detects visually confusable pairs between identifiers",
            is_externally_loaded: false,
            crate_level_only: true,
            ..::rustc_lint_defs::Lint::default_fields_for_macro()
        };declare_lint! {
78    /// The `confusable_idents` lint detects visually confusable pairs between
79    /// identifiers.
80    ///
81    /// ### Example
82    ///
83    /// ```rust
84    /// // Latin Capital Letter E With Caron
85    /// pub const Ě: i32 = 1;
86    /// // Latin Capital Letter E With Breve
87    /// pub const Ĕ: i32 = 2;
88    /// ```
89    ///
90    /// {{produces}}
91    ///
92    /// ### Explanation
93    ///
94    /// This lint warns when different identifiers may appear visually similar,
95    /// which can cause confusion.
96    ///
97    /// The confusable detection algorithm is based on [Unicode® Technical
98    /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
99    /// Detection][TR39Confusable]. For every distinct identifier X execute
100    /// the function `skeleton(X)`. If there exist two distinct identifiers X
101    /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
102    /// The compiler uses the same mechanism to check if an identifier is too
103    /// similar to a keyword.
104    ///
105    /// Note that the set of confusable characters may change over time.
106    /// Beware that if you "forbid" this lint that existing code may fail in
107    /// the future.
108    ///
109    /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
110    pub CONFUSABLE_IDENTS,
111    Warn,
112    "detects visually confusable pairs between identifiers",
113    crate_level_only
114}
115
116#[doc = r" The `mixed_script_confusables` lint detects visually confusable"]
#[doc = r" characters in identifiers between different [scripts]."]
#[doc = r""]
#[doc = r" [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)"]
#[doc = r""]
#[doc = r" ### Example"]
#[doc = r""]
#[doc = r" ```rust"]
#[doc =
r" // The Japanese katakana character エ can be confused with the Han character 工."]
#[doc = r#" const エ: &'static str = "アイウ";"#]
#[doc = r" ```"]
#[doc = r""]
#[doc = r" {{produces}}"]
#[doc = r""]
#[doc = r" ### Explanation"]
#[doc = r""]
#[doc =
r" This lint warns when characters between different scripts may appear"]
#[doc = r" visually similar, which can cause confusion."]
#[doc = r""]
#[doc =
r" If the crate contains other identifiers in the same script that have"]
#[doc =
r" non-confusable characters, then this lint will *not* be issued. For"]
#[doc = r" example, if the example given above has another identifier with"]
#[doc =
r" katakana characters (such as `let カタカナ = 123;`), then this indicates"]
#[doc =
r" that you are intentionally using katakana, and it will not warn about"]
#[doc = r" it."]
#[doc = r""]
#[doc = r" Note that the set of confusable characters may change over time."]
#[doc =
r#" Beware that if you "forbid" this lint that existing code may fail in"#]
#[doc = r" the future."]
pub static MIXED_SCRIPT_CONFUSABLES: &::rustc_lint_defs::Lint =
    &::rustc_lint_defs::Lint {
            name: "MIXED_SCRIPT_CONFUSABLES",
            default_level: ::rustc_lint_defs::Warn,
            desc: "detects Unicode scripts whose mixed script confusables codepoints are solely used",
            is_externally_loaded: false,
            crate_level_only: true,
            ..::rustc_lint_defs::Lint::default_fields_for_macro()
        };declare_lint! {
117    /// The `mixed_script_confusables` lint detects visually confusable
118    /// characters in identifiers between different [scripts].
119    ///
120    /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
121    ///
122    /// ### Example
123    ///
124    /// ```rust
125    /// // The Japanese katakana character エ can be confused with the Han character 工.
126    /// const エ: &'static str = "アイウ";
127    /// ```
128    ///
129    /// {{produces}}
130    ///
131    /// ### Explanation
132    ///
133    /// This lint warns when characters between different scripts may appear
134    /// visually similar, which can cause confusion.
135    ///
136    /// If the crate contains other identifiers in the same script that have
137    /// non-confusable characters, then this lint will *not* be issued. For
138    /// example, if the example given above has another identifier with
139    /// katakana characters (such as `let カタカナ = 123;`), then this indicates
140    /// that you are intentionally using katakana, and it will not warn about
141    /// it.
142    ///
143    /// Note that the set of confusable characters may change over time.
144    /// Beware that if you "forbid" this lint that existing code may fail in
145    /// the future.
146    pub MIXED_SCRIPT_CONFUSABLES,
147    Warn,
148    "detects Unicode scripts whose mixed script confusables codepoints are solely used",
149    crate_level_only
150}
151
152pub struct NonAsciiIdents;
#[automatically_derived]
impl ::core::marker::Copy for NonAsciiIdents { }
#[automatically_derived]
#[doc(hidden)]
unsafe impl ::core::clone::TrivialClone for NonAsciiIdents { }
#[automatically_derived]
impl ::core::clone::Clone for NonAsciiIdents {
    #[inline]
    fn clone(&self) -> NonAsciiIdents { *self }
}
impl ::rustc_lint_defs::LintPass for NonAsciiIdents {
    fn name(&self) -> &'static str { "NonAsciiIdents" }
    fn get_lints(&self) -> ::rustc_lint_defs::LintVec {
        ::alloc::boxed::box_assume_init_into_vec_unsafe(::alloc::intrinsics::write_box_via_move(::alloc::boxed::Box::new_uninit(),
                [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS,
                        MIXED_SCRIPT_CONFUSABLES]))
    }
}
impl NonAsciiIdents {
    #[allow(unused)]
    pub fn lint_vec() -> ::rustc_lint_defs::LintVec {
        ::alloc::boxed::box_assume_init_into_vec_unsafe(::alloc::intrinsics::write_box_via_move(::alloc::boxed::Box::new_uninit(),
                [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS,
                        MIXED_SCRIPT_CONFUSABLES]))
    }
}declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
153
154impl EarlyLintPass for NonAsciiIdents {
155    fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
156        use std::collections::BTreeMap;
157
158        use rustc_span::Span;
159        use unicode_security::GeneralSecurityProfile;
160
161        let check_non_ascii_idents = !cx.builder.lint_level_spec(NON_ASCII_IDENTS).is_allow();
162        let check_uncommon_codepoints = !cx.builder.lint_level_spec(UNCOMMON_CODEPOINTS).is_allow();
163        let check_confusable_idents = !cx.builder.lint_level_spec(CONFUSABLE_IDENTS).is_allow();
164        let check_mixed_script_confusables =
165            !cx.builder.lint_level_spec(MIXED_SCRIPT_CONFUSABLES).is_allow();
166
167        if !check_non_ascii_idents
168            && !check_uncommon_codepoints
169            && !check_confusable_idents
170            && !check_mixed_script_confusables
171        {
172            return;
173        }
174
175        let mut has_non_ascii_idents = false;
176        let symbols = cx.sess().psess.symbol_gallery.symbols.lock();
177
178        // Sort by `Span` so that error messages make sense with respect to the
179        // order of identifier locations in the code.
180        // We will soon sort, so the initial order does not matter.
181        #[allow(rustc::potential_query_instability)]
182        let mut symbols: Vec<_> = symbols.iter().collect();
183        symbols.sort_by_key(|k| k.1);
184        for &(ref symbol, &sp) in symbols.iter() {
185            let symbol_str = symbol.as_str();
186            if symbol_str.is_ascii() {
187                continue;
188            }
189            has_non_ascii_idents = true;
190            cx.emit_span_lint(NON_ASCII_IDENTS, sp, IdentifierNonAsciiChar);
191            if check_uncommon_codepoints
192                && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
193            {
194                let mut chars: Vec<_> = symbol_str
195                    .chars()
196                    .map(|c| (c, GeneralSecurityProfile::identifier_type(c)))
197                    .collect();
198
199                for (id_ty, id_ty_descr) in [
200                    (IdentifierType::Exclusion, "Exclusion"),
201                    (IdentifierType::Technical, "Technical"),
202                    (IdentifierType::Limited_Use, "Limited_Use"),
203                    (IdentifierType::Not_NFKC, "Not_NFKC"),
204                ] {
205                    let codepoints: Vec<_> =
206                        chars.extract_if(.., |(_, ty)| *ty == Some(id_ty)).collect();
207                    if codepoints.is_empty() {
208                        continue;
209                    }
210                    cx.emit_span_lint(
211                        UNCOMMON_CODEPOINTS,
212                        sp,
213                        IdentifierUncommonCodepoints {
214                            codepoints_len: codepoints.len(),
215                            codepoints: codepoints.into_iter().map(|(c, _)| c).collect(),
216                            identifier_type: id_ty_descr,
217                        },
218                    );
219                }
220
221                let remaining = chars
222                    .extract_if(.., |(c, _)| !GeneralSecurityProfile::identifier_allowed(*c))
223                    .collect::<Vec<_>>();
224                if !remaining.is_empty() {
225                    cx.emit_span_lint(
226                        UNCOMMON_CODEPOINTS,
227                        sp,
228                        IdentifierUncommonCodepoints {
229                            codepoints_len: remaining.len(),
230                            codepoints: remaining.into_iter().map(|(c, _)| c).collect(),
231                            identifier_type: "Restricted",
232                        },
233                    );
234                }
235            }
236        }
237
238        if has_non_ascii_idents && check_confusable_idents {
239            let mut skeleton_map: UnordMap<Symbol, (Symbol, Span, bool)> =
240                UnordMap::with_capacity(symbols.len());
241            let mut skeleton_buf = String::new();
242
243            for &(&symbol, &sp) in symbols.iter() {
244                use unicode_security::confusable_detection::skeleton;
245
246                let symbol_str = symbol.as_str();
247                let is_ascii = symbol_str.is_ascii();
248
249                // Get the skeleton as a `Symbol`.
250                skeleton_buf.clear();
251                skeleton_buf.extend(skeleton(symbol_str));
252                let skeleton_sym = if *symbol_str == *skeleton_buf {
253                    symbol
254                } else {
255                    Symbol::intern(&skeleton_buf)
256                };
257
258                skeleton_map
259                    .entry(skeleton_sym)
260                    .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
261                        if !*existing_is_ascii || !is_ascii {
262                            cx.emit_span_lint(
263                                CONFUSABLE_IDENTS,
264                                sp,
265                                ConfusableIdentifierPair {
266                                    existing_sym: *existing_symbol,
267                                    sym: symbol,
268                                    label: *existing_span,
269                                    main_label: sp,
270                                },
271                            );
272                        }
273                        if *existing_is_ascii && !is_ascii {
274                            *existing_symbol = symbol;
275                            *existing_span = sp;
276                            *existing_is_ascii = is_ascii;
277                        }
278                    })
279                    .or_insert((symbol, sp, is_ascii));
280            }
281        }
282
283        if has_non_ascii_idents && check_mixed_script_confusables {
284            use unicode_security::is_potential_mixed_script_confusable_char;
285            use unicode_security::mixed_script::AugmentedScriptSet;
286
287            #[derive(#[automatically_derived]
impl ::core::clone::Clone for ScriptSetUsage {
    #[inline]
    fn clone(&self) -> ScriptSetUsage {
        match self {
            ScriptSetUsage::Suspicious(__self_0, __self_1) =>
                ScriptSetUsage::Suspicious(::core::clone::Clone::clone(__self_0),
                    ::core::clone::Clone::clone(__self_1)),
            ScriptSetUsage::Verified => ScriptSetUsage::Verified,
        }
    }
}Clone)]
288            enum ScriptSetUsage {
289                Suspicious(Vec<char>, Span),
290                Verified,
291            }
292
293            let mut script_states: FxIndexMap<AugmentedScriptSet, ScriptSetUsage> =
294                Default::default();
295            let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
296            script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
297
298            let mut has_suspicious = false;
299            for &(ref symbol, &sp) in symbols.iter() {
300                let symbol_str = symbol.as_str();
301                for ch in symbol_str.chars() {
302                    if ch.is_ascii() {
303                        // all ascii characters are covered by exception.
304                        continue;
305                    }
306                    if !GeneralSecurityProfile::identifier_allowed(ch) {
307                        // this character is covered by `uncommon_codepoints` lint.
308                        continue;
309                    }
310                    let augmented_script_set = AugmentedScriptSet::for_char(ch);
311                    script_states
312                        .entry(augmented_script_set)
313                        .and_modify(|existing_state| {
314                            if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
315                                if is_potential_mixed_script_confusable_char(ch) {
316                                    ch_list.push(ch);
317                                } else {
318                                    *existing_state = ScriptSetUsage::Verified;
319                                }
320                            }
321                        })
322                        .or_insert_with(|| {
323                            if !is_potential_mixed_script_confusable_char(ch) {
324                                ScriptSetUsage::Verified
325                            } else {
326                                has_suspicious = true;
327                                ScriptSetUsage::Suspicious(::alloc::boxed::box_assume_init_into_vec_unsafe(::alloc::intrinsics::write_box_via_move(::alloc::boxed::Box::new_uninit(),
        [ch]))vec![ch], sp)
328                            }
329                        });
330                }
331            }
332
333            if has_suspicious {
334                // The end result is put in `lint_reports` which is sorted.
335                #[allow(rustc::potential_query_instability)]
336                let verified_augmented_script_sets = script_states
337                    .iter()
338                    .flat_map(|(k, v)| match v {
339                        ScriptSetUsage::Verified => Some(*k),
340                        _ => None,
341                    })
342                    .collect::<Vec<_>>();
343
344                // we're sorting the output here.
345                let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
346                    BTreeMap::new();
347
348                // The end result is put in `lint_reports` which is sorted.
349                #[allow(rustc::potential_query_instability)]
350                'outerloop: for (augment_script_set, usage) in script_states {
351                    let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
352
353                    if augment_script_set.is_all() {
354                        continue;
355                    }
356
357                    for existing in verified_augmented_script_sets.iter() {
358                        if existing.is_all() {
359                            continue;
360                        }
361                        let mut intersect = *existing;
362                        intersect.intersect_with(augment_script_set);
363                        if !intersect.is_empty() && !intersect.is_all() {
364                            continue 'outerloop;
365                        }
366                    }
367
368                    // We sort primitive chars here and can use unstable sort
369                    ch_list.sort_unstable();
370                    ch_list.dedup();
371                    lint_reports.insert((sp, ch_list), augment_script_set);
372                }
373
374                for ((sp, ch_list), script_set) in lint_reports {
375                    let mut includes = String::new();
376                    for (idx, ch) in ch_list.into_iter().enumerate() {
377                        if idx != 0 {
378                            includes += ", ";
379                        }
380                        let char_info = ::alloc::__export::must_use({
        ::alloc::fmt::format(format_args!("\'{0}\' (U+{1:04X})", ch,
                ch as u32))
    })format!("'{}' (U+{:04X})", ch, ch as u32);
381                        includes += &char_info;
382                    }
383                    cx.emit_span_lint(
384                        MIXED_SCRIPT_CONFUSABLES,
385                        sp,
386                        MixedScriptConfusables { set: script_set.to_string(), includes },
387                    );
388                }
389            }
390        }
391    }
392}