rustc_lint/non_ascii_idents.rs
1use rustc_ast as ast;
2use rustc_data_structures::fx::FxIndexMap;
3use rustc_data_structures::unord::UnordMap;
4use rustc_session::{declare_lint, declare_lint_pass};
5use rustc_span::Symbol;
6use unicode_security::general_security_profile::IdentifierType;
7
8use crate::lints::{
9 ConfusableIdentifierPair, IdentifierNonAsciiChar, IdentifierUncommonCodepoints,
10 MixedScriptConfusables,
11};
12use crate::{EarlyContext, EarlyLintPass, LintContext};
13
// `non_ascii_idents`: declared with default level `Allow` and marked
// `crate_level_only`. `NonAsciiIdents::check_crate` emits it once for every
// recorded identifier whose text is not pure ASCII.
declare_lint! {
    /// The `non_ascii_idents` lint detects non-ASCII identifiers.
    ///
    /// ### Example
    ///
    /// ```rust,compile_fail
    /// # #![allow(unused)]
    /// #![deny(non_ascii_idents)]
    /// fn main() {
    /// let föö = 1;
    /// }
    /// ```
    ///
    /// {{produces}}
    ///
    /// ### Explanation
    ///
    /// This lint allows projects that wish to retain the limit of only using
    /// ASCII characters to switch this lint to "forbid" (for example, to ease
    /// collaboration or for security reasons).
    /// See [RFC 2457] for more details.
    ///
    /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
    pub NON_ASCII_IDENTS,
    Allow,
    "detects non-ASCII identifiers",
    crate_level_only
}
42
// `uncommon_codepoints`: declared with default level `Warn` and marked
// `crate_level_only`. `NonAsciiIdents::check_crate` reports the offending
// codepoints of an identifier grouped by identifier type ("Exclusion",
// "Technical", "Limited_Use", "Not_NFKC"), with any other disallowed
// codepoint reported as "Restricted".
declare_lint! {
    /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
    /// identifiers.
    ///
    /// ### Example
    ///
    /// ```rust
    /// # #![allow(unused)]
    /// const µ: f64 = 0.000001;
    /// ```
    ///
    /// {{produces}}
    ///
    /// ### Explanation
    ///
    /// This lint warns about using characters which are not commonly used, and may
    /// cause visual confusion.
    ///
    /// This lint is triggered by identifiers that contain a codepoint that is
    /// not part of the set of "Allowed" codepoints as described by [Unicode®
    /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
    /// Security Profile for Identifiers][TR39Allowed].
    ///
    /// Note that the set of uncommon codepoints may change over time. Beware
    /// that if you "forbid" this lint, existing code may fail in the
    /// future.
    ///
    /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
    pub UNCOMMON_CODEPOINTS,
    Warn,
    "detects uncommon Unicode codepoints in identifiers",
    crate_level_only
}
76
// `confusable_idents`: declared with default level `Warn` and marked
// `crate_level_only`. `NonAsciiIdents::check_crate` only runs this check when
// at least one non-ASCII identifier exists, and it never reports a pair in
// which both identifiers are pure ASCII.
declare_lint! {
    /// The `confusable_idents` lint detects visually confusable pairs between
    /// identifiers.
    ///
    /// ### Example
    ///
    /// ```rust
    /// // Latin Capital Letter E With Caron
    /// pub const Ě: i32 = 1;
    /// // Latin Capital Letter E With Breve
    /// pub const Ĕ: i32 = 2;
    /// ```
    ///
    /// {{produces}}
    ///
    /// ### Explanation
    ///
    /// This lint warns when different identifiers may appear visually similar,
    /// which can cause confusion.
    ///
    /// The confusable detection algorithm is based on [Unicode® Technical
    /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
    /// Detection][TR39Confusable]. For every distinct identifier X execute
    /// the function `skeleton(X)`. If there exist two distinct identifiers X
    /// and Y in the same crate where `skeleton(X) = skeleton(Y)`, report it.
    /// The compiler uses the same mechanism to check if an identifier is too
    /// similar to a keyword.
    ///
    /// Note that the set of confusable characters may change over time.
    /// Beware that if you "forbid" this lint, existing code may fail in
    /// the future.
    ///
    /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
    pub CONFUSABLE_IDENTS,
    Warn,
    "detects visually confusable pairs between identifiers",
    crate_level_only
}
115
// `mixed_script_confusables`: declared with default level `Warn` and marked
// `crate_level_only`. `NonAsciiIdents::check_crate` treats the script set of
// 'A' (Latin) as always verified, ignores ASCII characters and characters
// rejected by the general security profile, and reports a script set only if
// every character seen from it is a potential mixed-script confusable.
declare_lint! {
    /// The `mixed_script_confusables` lint detects visually confusable
    /// characters in identifiers between different [scripts].
    ///
    /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
    ///
    /// ### Example
    ///
    /// ```rust
    /// // The Japanese katakana character エ can be confused with the Han character 工.
    /// const エ: &'static str = "アイウ";
    /// ```
    ///
    /// {{produces}}
    ///
    /// ### Explanation
    ///
    /// This lint warns when characters between different scripts may appear
    /// visually similar, which can cause confusion.
    ///
    /// If the crate contains other identifiers in the same script that have
    /// non-confusable characters, then this lint will *not* be issued. For
    /// example, if the example given above has another identifier with
    /// katakana characters (such as `let カタカナ = 123;`), then this indicates
    /// that you are intentionally using katakana, and it will not warn about
    /// it.
    ///
    /// Note that the set of confusable characters may change over time.
    /// Beware that if you "forbid" this lint, existing code may fail in
    /// the future.
    pub MIXED_SCRIPT_CONFUSABLES,
    Warn,
    "detects Unicode scripts whose mixed script confusables codepoints are solely used",
    crate_level_only
}
151
// Declares the `NonAsciiIdents` pass and lists the four lints declared in this
// file as the lints it is responsible for. All of them are produced by the
// single `check_crate` implementation below.
declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
153
154impl EarlyLintPass for NonAsciiIdents {
155 fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
156 use std::collections::BTreeMap;
157
158 use rustc_session::lint::Level;
159 use rustc_span::Span;
160 use unicode_security::GeneralSecurityProfile;
161
162 let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
163 let check_uncommon_codepoints =
164 cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
165 let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
166 let check_mixed_script_confusables =
167 cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
168
169 if !check_non_ascii_idents
170 && !check_uncommon_codepoints
171 && !check_confusable_idents
172 && !check_mixed_script_confusables
173 {
174 return;
175 }
176
177 let mut has_non_ascii_idents = false;
178 let symbols = cx.sess().psess.symbol_gallery.symbols.lock();
179
180 // Sort by `Span` so that error messages make sense with respect to the
181 // order of identifier locations in the code.
182 // We will soon sort, so the initial order does not matter.
183 #[allow(rustc::potential_query_instability)]
184 let mut symbols: Vec<_> = symbols.iter().collect();
185 symbols.sort_by_key(|k| k.1);
186 for &(ref symbol, &sp) in symbols.iter() {
187 let symbol_str = symbol.as_str();
188 if symbol_str.is_ascii() {
189 continue;
190 }
191 has_non_ascii_idents = true;
192 cx.emit_span_lint(NON_ASCII_IDENTS, sp, IdentifierNonAsciiChar);
193 if check_uncommon_codepoints
194 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
195 {
196 let mut chars: Vec<_> = symbol_str
197 .chars()
198 .map(|c| (c, GeneralSecurityProfile::identifier_type(c)))
199 .collect();
200
201 for (id_ty, id_ty_descr) in [
202 (IdentifierType::Exclusion, "Exclusion"),
203 (IdentifierType::Technical, "Technical"),
204 (IdentifierType::Limited_Use, "Limited_Use"),
205 (IdentifierType::Not_NFKC, "Not_NFKC"),
206 ] {
207 let codepoints: Vec<_> =
208 chars.extract_if(.., |(_, ty)| *ty == Some(id_ty)).collect();
209 if codepoints.is_empty() {
210 continue;
211 }
212 cx.emit_span_lint(
213 UNCOMMON_CODEPOINTS,
214 sp,
215 IdentifierUncommonCodepoints {
216 codepoints_len: codepoints.len(),
217 codepoints: codepoints.into_iter().map(|(c, _)| c).collect(),
218 identifier_type: id_ty_descr,
219 },
220 );
221 }
222
223 let remaining = chars
224 .extract_if(.., |(c, _)| !GeneralSecurityProfile::identifier_allowed(*c))
225 .collect::<Vec<_>>();
226 if !remaining.is_empty() {
227 cx.emit_span_lint(
228 UNCOMMON_CODEPOINTS,
229 sp,
230 IdentifierUncommonCodepoints {
231 codepoints_len: remaining.len(),
232 codepoints: remaining.into_iter().map(|(c, _)| c).collect(),
233 identifier_type: "Restricted",
234 },
235 );
236 }
237 }
238 }
239
240 if has_non_ascii_idents && check_confusable_idents {
241 let mut skeleton_map: UnordMap<Symbol, (Symbol, Span, bool)> =
242 UnordMap::with_capacity(symbols.len());
243 let mut skeleton_buf = String::new();
244
245 for &(&symbol, &sp) in symbols.iter() {
246 use unicode_security::confusable_detection::skeleton;
247
248 let symbol_str = symbol.as_str();
249 let is_ascii = symbol_str.is_ascii();
250
251 // Get the skeleton as a `Symbol`.
252 skeleton_buf.clear();
253 skeleton_buf.extend(skeleton(symbol_str));
254 let skeleton_sym = if *symbol_str == *skeleton_buf {
255 symbol
256 } else {
257 Symbol::intern(&skeleton_buf)
258 };
259
260 skeleton_map
261 .entry(skeleton_sym)
262 .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
263 if !*existing_is_ascii || !is_ascii {
264 cx.emit_span_lint(
265 CONFUSABLE_IDENTS,
266 sp,
267 ConfusableIdentifierPair {
268 existing_sym: *existing_symbol,
269 sym: symbol,
270 label: *existing_span,
271 main_label: sp,
272 },
273 );
274 }
275 if *existing_is_ascii && !is_ascii {
276 *existing_symbol = symbol;
277 *existing_span = sp;
278 *existing_is_ascii = is_ascii;
279 }
280 })
281 .or_insert((symbol, sp, is_ascii));
282 }
283 }
284
285 if has_non_ascii_idents && check_mixed_script_confusables {
286 use unicode_security::is_potential_mixed_script_confusable_char;
287 use unicode_security::mixed_script::AugmentedScriptSet;
288
289 #[derive(Clone)]
290 enum ScriptSetUsage {
291 Suspicious(Vec<char>, Span),
292 Verified,
293 }
294
295 let mut script_states: FxIndexMap<AugmentedScriptSet, ScriptSetUsage> =
296 Default::default();
297 let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
298 script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
299
300 let mut has_suspicious = false;
301 for &(ref symbol, &sp) in symbols.iter() {
302 let symbol_str = symbol.as_str();
303 for ch in symbol_str.chars() {
304 if ch.is_ascii() {
305 // all ascii characters are covered by exception.
306 continue;
307 }
308 if !GeneralSecurityProfile::identifier_allowed(ch) {
309 // this character is covered by `uncommon_codepoints` lint.
310 continue;
311 }
312 let augmented_script_set = AugmentedScriptSet::for_char(ch);
313 script_states
314 .entry(augmented_script_set)
315 .and_modify(|existing_state| {
316 if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
317 if is_potential_mixed_script_confusable_char(ch) {
318 ch_list.push(ch);
319 } else {
320 *existing_state = ScriptSetUsage::Verified;
321 }
322 }
323 })
324 .or_insert_with(|| {
325 if !is_potential_mixed_script_confusable_char(ch) {
326 ScriptSetUsage::Verified
327 } else {
328 has_suspicious = true;
329 ScriptSetUsage::Suspicious(vec![ch], sp)
330 }
331 });
332 }
333 }
334
335 if has_suspicious {
336 // The end result is put in `lint_reports` which is sorted.
337 #[allow(rustc::potential_query_instability)]
338 let verified_augmented_script_sets = script_states
339 .iter()
340 .flat_map(|(k, v)| match v {
341 ScriptSetUsage::Verified => Some(*k),
342 _ => None,
343 })
344 .collect::<Vec<_>>();
345
346 // we're sorting the output here.
347 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
348 BTreeMap::new();
349
350 // The end result is put in `lint_reports` which is sorted.
351 #[allow(rustc::potential_query_instability)]
352 'outerloop: for (augment_script_set, usage) in script_states {
353 let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
354
355 if augment_script_set.is_all() {
356 continue;
357 }
358
359 for existing in verified_augmented_script_sets.iter() {
360 if existing.is_all() {
361 continue;
362 }
363 let mut intersect = *existing;
364 intersect.intersect_with(augment_script_set);
365 if !intersect.is_empty() && !intersect.is_all() {
366 continue 'outerloop;
367 }
368 }
369
370 // We sort primitive chars here and can use unstable sort
371 ch_list.sort_unstable();
372 ch_list.dedup();
373 lint_reports.insert((sp, ch_list), augment_script_set);
374 }
375
376 for ((sp, ch_list), script_set) in lint_reports {
377 let mut includes = String::new();
378 for (idx, ch) in ch_list.into_iter().enumerate() {
379 if idx != 0 {
380 includes += ", ";
381 }
382 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
383 includes += &char_info;
384 }
385 cx.emit_span_lint(
386 MIXED_SCRIPT_CONFUSABLES,
387 sp,
388 MixedScriptConfusables { set: script_set.to_string(), includes },
389 );
390 }
391 }
392 }
393 }
394}