rustc_lint/non_ascii_idents.rs
1use rustc_ast as ast;
2use rustc_data_structures::fx::FxIndexMap;
3use rustc_data_structures::unord::UnordMap;
4use rustc_session::{declare_lint, declare_lint_pass};
5use rustc_span::Symbol;
6use unicode_security::general_security_profile::IdentifierType;
7
8use crate::lints::{
9 ConfusableIdentifierPair, IdentifierNonAsciiChar, IdentifierUncommonCodepoints,
10 MixedScriptConfusables,
11};
12use crate::{EarlyContext, EarlyLintPass, LintContext};
13
14declare_lint! {
15 /// The `non_ascii_idents` lint detects non-ASCII identifiers.
16 ///
17 /// ### Example
18 ///
19 /// ```rust,compile_fail
20 /// # #![allow(unused)]
21 /// #![deny(non_ascii_idents)]
22 /// fn main() {
23 /// let föö = 1;
24 /// }
25 /// ```
26 ///
27 /// {{produces}}
28 ///
29 /// ### Explanation
30 ///
31 /// This lint allows projects that wish to retain the limit of only using
32 /// ASCII characters to switch this lint to "forbid" (for example to ease
33 /// collaboration or for security reasons).
34 /// See [RFC 2457] for more details.
35 ///
36 /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
37 pub NON_ASCII_IDENTS,
38 Allow,
39 "detects non-ASCII identifiers",
40 crate_level_only
41}
42
43declare_lint! {
44 /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
45 /// identifiers.
46 ///
47 /// ### Example
48 ///
49 /// ```rust
50 /// # #![allow(unused)]
51 /// const µ: f64 = 0.000001;
52 /// ```
53 ///
54 /// {{produces}}
55 ///
56 /// ### Explanation
57 ///
58 /// This lint warns about using characters which are not commonly used, and may
59 /// cause visual confusion.
60 ///
61 /// This lint is triggered by identifiers that contain a codepoint that is
62 /// not part of the set of "Allowed" codepoints as described by [Unicode®
63 /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
64 /// Security Profile for Identifiers][TR39Allowed].
65 ///
66 /// Note that the set of uncommon codepoints may change over time. Beware
67 /// that if you "forbid" this lint that existing code may fail in the
68 /// future.
69 ///
70 /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
71 pub UNCOMMON_CODEPOINTS,
72 Warn,
73 "detects uncommon Unicode codepoints in identifiers",
74 crate_level_only
75}
76
77declare_lint! {
78 /// The `confusable_idents` lint detects visually confusable pairs between
79 /// identifiers.
80 ///
81 /// ### Example
82 ///
83 /// ```rust
84 /// // Latin Capital Letter E With Caron
85 /// pub const Ě: i32 = 1;
86 /// // Latin Capital Letter E With Breve
87 /// pub const Ĕ: i32 = 2;
88 /// ```
89 ///
90 /// {{produces}}
91 ///
92 /// ### Explanation
93 ///
94 /// This lint warns when different identifiers may appear visually similar,
95 /// which can cause confusion.
96 ///
97 /// The confusable detection algorithm is based on [Unicode® Technical
98 /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
99 /// Detection][TR39Confusable]. For every distinct identifier X execute
100 /// the function `skeleton(X)`. If there exist two distinct identifiers X
101 /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
102 /// The compiler uses the same mechanism to check if an identifier is too
103 /// similar to a keyword.
104 ///
105 /// Note that the set of confusable characters may change over time.
106 /// Beware that if you "forbid" this lint that existing code may fail in
107 /// the future.
108 ///
109 /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
110 pub CONFUSABLE_IDENTS,
111 Warn,
112 "detects visually confusable pairs between identifiers",
113 crate_level_only
114}
115
116declare_lint! {
117 /// The `mixed_script_confusables` lint detects visually confusable
118 /// characters in identifiers between different [scripts].
119 ///
120 /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
121 ///
122 /// ### Example
123 ///
124 /// ```rust
125 /// // The Japanese katakana character エ can be confused with the Han character 工.
126 /// const エ: &'static str = "アイウ";
127 /// ```
128 ///
129 /// {{produces}}
130 ///
131 /// ### Explanation
132 ///
133 /// This lint warns when characters between different scripts may appear
134 /// visually similar, which can cause confusion.
135 ///
136 /// If the crate contains other identifiers in the same script that have
137 /// non-confusable characters, then this lint will *not* be issued. For
138 /// example, if the example given above has another identifier with
139 /// katakana characters (such as `let カタカナ = 123;`), then this indicates
140 /// that you are intentionally using katakana, and it will not warn about
141 /// it.
142 ///
143 /// Note that the set of confusable characters may change over time.
144 /// Beware that if you "forbid" this lint that existing code may fail in
145 /// the future.
146 pub MIXED_SCRIPT_CONFUSABLES,
147 Warn,
148 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
149 crate_level_only
150}
151
152declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
153
154impl EarlyLintPass for NonAsciiIdents {
155 fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
156 use std::collections::BTreeMap;
157
158 use rustc_session::lint::Level;
159 use rustc_span::Span;
160 use unicode_security::GeneralSecurityProfile;
161
162 let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).level != Level::Allow;
163 let check_uncommon_codepoints =
164 cx.builder.lint_level(UNCOMMON_CODEPOINTS).level != Level::Allow;
165 let check_confusable_idents =
166 cx.builder.lint_level(CONFUSABLE_IDENTS).level != Level::Allow;
167 let check_mixed_script_confusables =
168 cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).level != Level::Allow;
169
170 if !check_non_ascii_idents
171 && !check_uncommon_codepoints
172 && !check_confusable_idents
173 && !check_mixed_script_confusables
174 {
175 return;
176 }
177
178 let mut has_non_ascii_idents = false;
179 let symbols = cx.sess().psess.symbol_gallery.symbols.lock();
180
181 // Sort by `Span` so that error messages make sense with respect to the
182 // order of identifier locations in the code.
183 // We will soon sort, so the initial order does not matter.
184 #[allow(rustc::potential_query_instability)]
185 let mut symbols: Vec<_> = symbols.iter().collect();
186 symbols.sort_by_key(|k| k.1);
187 for &(ref symbol, &sp) in symbols.iter() {
188 let symbol_str = symbol.as_str();
189 if symbol_str.is_ascii() {
190 continue;
191 }
192 has_non_ascii_idents = true;
193 cx.emit_span_lint(NON_ASCII_IDENTS, sp, IdentifierNonAsciiChar);
194 if check_uncommon_codepoints
195 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
196 {
197 let mut chars: Vec<_> = symbol_str
198 .chars()
199 .map(|c| (c, GeneralSecurityProfile::identifier_type(c)))
200 .collect();
201
202 for (id_ty, id_ty_descr) in [
203 (IdentifierType::Exclusion, "Exclusion"),
204 (IdentifierType::Technical, "Technical"),
205 (IdentifierType::Limited_Use, "Limited_Use"),
206 (IdentifierType::Not_NFKC, "Not_NFKC"),
207 ] {
208 let codepoints: Vec<_> =
209 chars.extract_if(.., |(_, ty)| *ty == Some(id_ty)).collect();
210 if codepoints.is_empty() {
211 continue;
212 }
213 cx.emit_span_lint(
214 UNCOMMON_CODEPOINTS,
215 sp,
216 IdentifierUncommonCodepoints {
217 codepoints_len: codepoints.len(),
218 codepoints: codepoints.into_iter().map(|(c, _)| c).collect(),
219 identifier_type: id_ty_descr,
220 },
221 );
222 }
223
224 let remaining = chars
225 .extract_if(.., |(c, _)| !GeneralSecurityProfile::identifier_allowed(*c))
226 .collect::<Vec<_>>();
227 if !remaining.is_empty() {
228 cx.emit_span_lint(
229 UNCOMMON_CODEPOINTS,
230 sp,
231 IdentifierUncommonCodepoints {
232 codepoints_len: remaining.len(),
233 codepoints: remaining.into_iter().map(|(c, _)| c).collect(),
234 identifier_type: "Restricted",
235 },
236 );
237 }
238 }
239 }
240
241 if has_non_ascii_idents && check_confusable_idents {
242 let mut skeleton_map: UnordMap<Symbol, (Symbol, Span, bool)> =
243 UnordMap::with_capacity(symbols.len());
244 let mut skeleton_buf = String::new();
245
246 for &(&symbol, &sp) in symbols.iter() {
247 use unicode_security::confusable_detection::skeleton;
248
249 let symbol_str = symbol.as_str();
250 let is_ascii = symbol_str.is_ascii();
251
252 // Get the skeleton as a `Symbol`.
253 skeleton_buf.clear();
254 skeleton_buf.extend(skeleton(symbol_str));
255 let skeleton_sym = if *symbol_str == *skeleton_buf {
256 symbol
257 } else {
258 Symbol::intern(&skeleton_buf)
259 };
260
261 skeleton_map
262 .entry(skeleton_sym)
263 .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
264 if !*existing_is_ascii || !is_ascii {
265 cx.emit_span_lint(
266 CONFUSABLE_IDENTS,
267 sp,
268 ConfusableIdentifierPair {
269 existing_sym: *existing_symbol,
270 sym: symbol,
271 label: *existing_span,
272 main_label: sp,
273 },
274 );
275 }
276 if *existing_is_ascii && !is_ascii {
277 *existing_symbol = symbol;
278 *existing_span = sp;
279 *existing_is_ascii = is_ascii;
280 }
281 })
282 .or_insert((symbol, sp, is_ascii));
283 }
284 }
285
286 if has_non_ascii_idents && check_mixed_script_confusables {
287 use unicode_security::is_potential_mixed_script_confusable_char;
288 use unicode_security::mixed_script::AugmentedScriptSet;
289
290 #[derive(Clone)]
291 enum ScriptSetUsage {
292 Suspicious(Vec<char>, Span),
293 Verified,
294 }
295
296 let mut script_states: FxIndexMap<AugmentedScriptSet, ScriptSetUsage> =
297 Default::default();
298 let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
299 script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
300
301 let mut has_suspicious = false;
302 for &(ref symbol, &sp) in symbols.iter() {
303 let symbol_str = symbol.as_str();
304 for ch in symbol_str.chars() {
305 if ch.is_ascii() {
306 // all ascii characters are covered by exception.
307 continue;
308 }
309 if !GeneralSecurityProfile::identifier_allowed(ch) {
310 // this character is covered by `uncommon_codepoints` lint.
311 continue;
312 }
313 let augmented_script_set = AugmentedScriptSet::for_char(ch);
314 script_states
315 .entry(augmented_script_set)
316 .and_modify(|existing_state| {
317 if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
318 if is_potential_mixed_script_confusable_char(ch) {
319 ch_list.push(ch);
320 } else {
321 *existing_state = ScriptSetUsage::Verified;
322 }
323 }
324 })
325 .or_insert_with(|| {
326 if !is_potential_mixed_script_confusable_char(ch) {
327 ScriptSetUsage::Verified
328 } else {
329 has_suspicious = true;
330 ScriptSetUsage::Suspicious(vec![ch], sp)
331 }
332 });
333 }
334 }
335
336 if has_suspicious {
337 // The end result is put in `lint_reports` which is sorted.
338 #[allow(rustc::potential_query_instability)]
339 let verified_augmented_script_sets = script_states
340 .iter()
341 .flat_map(|(k, v)| match v {
342 ScriptSetUsage::Verified => Some(*k),
343 _ => None,
344 })
345 .collect::<Vec<_>>();
346
347 // we're sorting the output here.
348 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
349 BTreeMap::new();
350
351 // The end result is put in `lint_reports` which is sorted.
352 #[allow(rustc::potential_query_instability)]
353 'outerloop: for (augment_script_set, usage) in script_states {
354 let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
355
356 if augment_script_set.is_all() {
357 continue;
358 }
359
360 for existing in verified_augmented_script_sets.iter() {
361 if existing.is_all() {
362 continue;
363 }
364 let mut intersect = *existing;
365 intersect.intersect_with(augment_script_set);
366 if !intersect.is_empty() && !intersect.is_all() {
367 continue 'outerloop;
368 }
369 }
370
371 // We sort primitive chars here and can use unstable sort
372 ch_list.sort_unstable();
373 ch_list.dedup();
374 lint_reports.insert((sp, ch_list), augment_script_set);
375 }
376
377 for ((sp, ch_list), script_set) in lint_reports {
378 let mut includes = String::new();
379 for (idx, ch) in ch_list.into_iter().enumerate() {
380 if idx != 0 {
381 includes += ", ";
382 }
383 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
384 includes += &char_info;
385 }
386 cx.emit_span_lint(
387 MIXED_SCRIPT_CONFUSABLES,
388 sp,
389 MixedScriptConfusables { set: script_set.to_string(), includes },
390 );
391 }
392 }
393 }
394 }
395}