From ef24faf130b35964cce159bd130631b6e90ed0ea Mon Sep 17 00:00:00 2001
From: Charles Lew <crlf0710@gmail.com>
Date: Sun, 10 May 2020 09:10:15 +0800
Subject: [PATCH] Refactor non_ascii_idents lints, exclude ascii pair for
 confusable_idents lint.

---
 src/librustc_lint/non_ascii_idents.rs         | 235 ++++++++----------
 src/librustc_session/parse.rs                 |   3 +-
 .../lint-confusable-idents.rs                 |  10 +-
 .../lint-confusable-idents.stderr             |   8 +-
 .../lint-non-ascii-idents.rs                  |   4 +-
 .../lint-non-ascii-idents.stderr              |   8 +-
 .../lint-uncommon-codepoints.rs               |   4 +-
 .../lint-uncommon-codepoints.stderr           |   8 +-
 src/test/ui/parser/issue-62524.rs             |   2 +
 src/test/ui/parser/issue-62524.stderr         |   6 +-
 10 files changed, 131 insertions(+), 157 deletions(-)

diff --git a/src/librustc_lint/non_ascii_idents.rs b/src/librustc_lint/non_ascii_idents.rs
index 064b0255397ce..90bd7ad4acf71 100644
--- a/src/librustc_lint/non_ascii_idents.rs
+++ b/src/librustc_lint/non_ascii_idents.rs
@@ -1,9 +1,7 @@
 use crate::{EarlyContext, EarlyLintPass, LintContext};
 use rustc_ast::ast;
 use rustc_data_structures::fx::FxHashMap;
-use rustc_span::symbol::{Ident, SymbolStr};
-use std::hash::{Hash, Hasher};
-use std::ops::Deref;
+use rustc_span::symbol::SymbolStr;
 
 declare_lint! {
     pub NON_ASCII_IDENTS,
@@ -19,158 +17,133 @@ declare_lint! {
     crate_level_only
 }
 
-// FIXME: Change this to warn.
 declare_lint! {
     pub CONFUSABLE_IDENTS,
-    Allow,
+    Warn,
     "detects visually confusable pairs between identifiers",
     crate_level_only
 }
 
 declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS]);
 
-enum CowBoxSymStr {
-    Interned(SymbolStr),
-    Owned(Box<str>),
-}
-
-impl Deref for CowBoxSymStr {
-    type Target = str;
-
-    fn deref(&self) -> &str {
-        match self {
-            CowBoxSymStr::Interned(interned) => interned,
-            CowBoxSymStr::Owned(ref owned) => owned,
-        }
-    }
-}
-
-impl Hash for CowBoxSymStr {
-    #[inline]
-    fn hash<H: Hasher>(&self, state: &mut H) {
-        Hash::hash(&**self, state)
-    }
-}
-
-impl PartialEq<CowBoxSymStr> for CowBoxSymStr {
-    #[inline]
-    fn eq(&self, other: &CowBoxSymStr) -> bool {
-        PartialEq::eq(&**self, &**other)
-    }
-}
-
-impl Eq for CowBoxSymStr {}
-
-fn calc_skeleton(symbol_str: SymbolStr, buffer: &'_ mut String) -> CowBoxSymStr {
-    use std::mem::swap;
-    use unicode_security::confusable_detection::skeleton;
-    buffer.clear();
-    buffer.extend(skeleton(&symbol_str));
-    if symbol_str == *buffer {
-        CowBoxSymStr::Interned(symbol_str)
-    } else {
-        let mut owned = String::new();
-        swap(buffer, &mut owned);
-        CowBoxSymStr::Owned(owned.into_boxed_str())
-    }
-}
-
-fn is_in_ascii_confusable_closure(c: char) -> bool {
-    // FIXME: move this table to `unicode_security` crate.
-    // data here corresponds to Unicode 13.
-    const ASCII_CONFUSABLE_CLOSURE: &[(u64, u64)] = &[(0x00, 0x7f), (0xba, 0xba), (0x2080, 0x2080)];
-    let c = c as u64;
-    for &(range_start, range_end) in ASCII_CONFUSABLE_CLOSURE {
-        if c >= range_start && c <= range_end {
-            return true;
-        }
-    }
-    false
-}
-
-fn is_in_ascii_confusable_closure_relevant_list(c: char) -> bool {
-    // FIXME: move this table to `unicode_security` crate.
-    // data here corresponds to Unicode 13.
-    const ASCII_CONFUSABLE_CLOSURE_RELEVANT_LIST: &[u64] = &[
-        0x22, 0x25, 0x27, 0x2f, 0x30, 0x31, 0x49, 0x4f, 0x60, 0x6c, 0x6d, 0x6e, 0x72, 0x7c, 0xba,
-        0x2080,
-    ];
-    let c = c as u64;
-    for &item in ASCII_CONFUSABLE_CLOSURE_RELEVANT_LIST {
-        if c == item {
-            return true;
-        }
-    }
-    false
-}
-
 impl EarlyLintPass for NonAsciiIdents {
     fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
         use rustc_session::lint::Level;
-        if cx.builder.lint_level(CONFUSABLE_IDENTS).0 == Level::Allow {
+        use rustc_span::Span;
+        use unicode_security::GeneralSecurityProfile;
+        use utils::CowBoxSymStr;
+
+        let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
+        let check_uncommon_codepoints =
+            cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
+        let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
+
+        if !check_non_ascii_idents && !check_uncommon_codepoints && !check_confusable_idents {
             return;
         }
+
+        let mut has_non_ascii_idents = false;
         let symbols = cx.sess.parse_sess.symbol_gallery.symbols.lock();
-        let mut symbol_strs_and_spans = Vec::with_capacity(symbols.len());
-        let mut in_fast_path = true;
-        for (symbol, sp) in symbols.iter() {
-            // fast path
+        for (symbol, &sp) in symbols.iter() {
             let symbol_str = symbol.as_str();
-            if !symbol_str.chars().all(is_in_ascii_confusable_closure) {
-                // fallback to slow path.
-                symbol_strs_and_spans.clear();
-                in_fast_path = false;
-                break;
+            if symbol_str.is_ascii() {
+                continue;
             }
-            if symbol_str.chars().any(is_in_ascii_confusable_closure_relevant_list) {
-                symbol_strs_and_spans.push((symbol_str, *sp));
+            has_non_ascii_idents = true;
+            cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| {
+                lint.build("identifier contains non-ASCII characters").emit()
+            });
+            if check_uncommon_codepoints
+                && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
+            {
+                cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| {
+                    lint.build("identifier contains uncommon Unicode codepoints").emit()
+                })
             }
         }
-        if !in_fast_path {
-            // slow path
-            for (symbol, sp) in symbols.iter() {
+
+        if has_non_ascii_idents && check_confusable_idents {
+            let mut skeleton_map: FxHashMap<CowBoxSymStr, (SymbolStr, Span, bool)> =
+                FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
+            let mut str_buf = String::new();
+            for (symbol, &sp) in symbols.iter() {
+                fn calc_skeleton(symbol_str: &SymbolStr, buffer: &mut String) -> CowBoxSymStr {
+                    use std::mem::replace;
+                    use unicode_security::confusable_detection::skeleton;
+                    buffer.clear();
+                    buffer.extend(skeleton(symbol_str));
+                    if *symbol_str == *buffer {
+                        CowBoxSymStr::Interned(symbol_str.clone())
+                    } else {
+                        let owned = replace(buffer, String::new());
+                        CowBoxSymStr::Owned(owned.into_boxed_str())
+                    }
+                }
                 let symbol_str = symbol.as_str();
-                symbol_strs_and_spans.push((symbol_str, *sp));
+                let is_ascii = symbol_str.is_ascii();
+                let skeleton = calc_skeleton(&symbol_str, &mut str_buf);
+                skeleton_map
+                    .entry(skeleton)
+                    .and_modify(|(existing_symbolstr, existing_span, existing_is_ascii)| {
+                        if !*existing_is_ascii || !is_ascii {
+                            cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
+                                lint.build(&format!(
+                                    "identifier pair considered confusable between `{}` and `{}`",
+                                    existing_symbolstr, symbol_str
+                                ))
+                                .span_label(
+                                    *existing_span,
+                                    "this is where the previous identifier occurred",
+                                )
+                                .emit();
+                            });
+                        }
+                        if *existing_is_ascii && !is_ascii {
+                            *existing_symbolstr = symbol_str.clone();
+                            *existing_span = sp;
+                            *existing_is_ascii = is_ascii;
+                        }
+                    })
+                    .or_insert((symbol_str, sp, is_ascii));
             }
         }
-        drop(symbols);
-        symbol_strs_and_spans.sort_by_key(|x| x.0.clone());
-        let mut skeleton_map =
-            FxHashMap::with_capacity_and_hasher(symbol_strs_and_spans.len(), Default::default());
-        let mut str_buf = String::new();
-        for (symbol_str, sp) in symbol_strs_and_spans {
-            let skeleton = calc_skeleton(symbol_str.clone(), &mut str_buf);
-            skeleton_map
-                .entry(skeleton)
-                .and_modify(|(existing_symbolstr, existing_span)| {
-                    cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
-                        lint.build(&format!(
-                            "identifier pair considered confusable between `{}` and `{}`",
-                            existing_symbolstr, symbol_str
-                        ))
-                        .span_label(
-                            *existing_span,
-                            "this is where the previous identifier occurred",
-                        )
-                        .emit();
-                    });
-                })
-                .or_insert((symbol_str, sp));
+    }
+}
+
+mod utils {
+    use rustc_span::symbol::SymbolStr;
+    use std::hash::{Hash, Hasher};
+    use std::ops::Deref;
+
+    pub(super) enum CowBoxSymStr {
+        Interned(SymbolStr),
+        Owned(Box<str>),
+    }
+
+    impl Deref for CowBoxSymStr {
+        type Target = str;
+
+        fn deref(&self) -> &str {
+            match self {
+                CowBoxSymStr::Interned(interned) => interned,
+                CowBoxSymStr::Owned(ref owned) => owned,
+            }
         }
     }
-    fn check_ident(&mut self, cx: &EarlyContext<'_>, ident: Ident) {
-        use unicode_security::GeneralSecurityProfile;
-        let name_str = ident.name.as_str();
-        if name_str.is_ascii() {
-            return;
+
+    impl Hash for CowBoxSymStr {
+        #[inline]
+        fn hash<H: Hasher>(&self, state: &mut H) {
+            Hash::hash(&**self, state)
         }
-        cx.struct_span_lint(NON_ASCII_IDENTS, ident.span, |lint| {
-            lint.build("identifier contains non-ASCII characters").emit()
-        });
-        if !name_str.chars().all(GeneralSecurityProfile::identifier_allowed) {
-            cx.struct_span_lint(UNCOMMON_CODEPOINTS, ident.span, |lint| {
-                lint.build("identifier contains uncommon Unicode codepoints").emit()
-            })
+    }
+
+    impl PartialEq<CowBoxSymStr> for CowBoxSymStr {
+        #[inline]
+        fn eq(&self, other: &CowBoxSymStr) -> bool {
+            PartialEq::eq(&**self, &**other)
         }
     }
+
+    impl Eq for CowBoxSymStr {}
 }
diff --git a/src/librustc_session/parse.rs b/src/librustc_session/parse.rs
index ddbc95fb1b0b8..f4e5da4d54f46 100644
--- a/src/librustc_session/parse.rs
+++ b/src/librustc_session/parse.rs
@@ -13,6 +13,7 @@ use rustc_span::hygiene::ExpnId;
 use rustc_span::source_map::{FilePathMapping, SourceMap};
 use rustc_span::{MultiSpan, Span, Symbol};
 
+use std::collections::BTreeMap;
 use std::path::PathBuf;
 use std::str;
 
@@ -63,7 +64,7 @@ impl GatedSpans {
 #[derive(Default)]
 pub struct SymbolGallery {
     /// All symbols occurred and their first occurrance span.
-    pub symbols: Lock<FxHashMap<Symbol, Span>>,
+    pub symbols: Lock<BTreeMap<Symbol, Span>>,
 }
 
 impl SymbolGallery {
diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-confusable-idents.rs b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-confusable-idents.rs
index 12093837d2630..e15ed2e70b896 100644
--- a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-confusable-idents.rs
+++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-confusable-idents.rs
@@ -2,8 +2,14 @@
 #![deny(confusable_idents)]
 #![allow(uncommon_codepoints, non_upper_case_globals)]
 
-const s: usize = 42; //~ ERROR identifier pair considered confusable
+const s: usize = 42;
 
 fn main() {
-    let s = "rust";
+    let s = "rust"; //~ ERROR identifier pair considered confusable
+    not_affected();
+}
+
+fn not_affected() {
+    let s1 = 1;
+    let sl = 'l';
 }
diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-confusable-idents.stderr b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-confusable-idents.stderr
index 40ee18acb3cd4..218f94f7b5829 100644
--- a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-confusable-idents.stderr
+++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-confusable-idents.stderr
@@ -1,11 +1,11 @@
-error: identifier pair considered confusable between `s` and `s`
-  --> $DIR/lint-confusable-idents.rs:5:7
+error: identifier pair considered confusable between `s` and `s`
+  --> $DIR/lint-confusable-idents.rs:8:9
    |
 LL | const s: usize = 42;
-   |       ^^
+   |       -- this is where the previous identifier occurred
 ...
 LL |     let s = "rust";
-   |         - this is where the previous identifier occurred
+   |         ^
    |
 note: the lint level is defined here
   --> $DIR/lint-confusable-idents.rs:2:9
diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-non-ascii-idents.rs b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-non-ascii-idents.rs
index 057329a0a650c..20d00cf701a15 100644
--- a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-non-ascii-idents.rs
+++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-non-ascii-idents.rs
@@ -7,5 +7,7 @@ fn coöperation() {} //~ ERROR identifier contains non-ASCII characters
 
 fn main() {
     let naïveté = 2; //~ ERROR identifier contains non-ASCII characters
-    println!("{}", naïveté); //~ ERROR identifier contains non-ASCII characters
+
+    // using the same identifier the second time won't trigger the lint.
+    println!("{}", naïveté);
 }
diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-non-ascii-idents.stderr b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-non-ascii-idents.stderr
index 6c9f0866c017a..048b6ff5d687f 100644
--- a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-non-ascii-idents.stderr
+++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-non-ascii-idents.stderr
@@ -22,11 +22,5 @@ error: identifier contains non-ASCII characters
 LL |     let naïveté = 2;
    |         ^^^^^^^
 
-error: identifier contains non-ASCII characters
-  --> $DIR/lint-non-ascii-idents.rs:10:20
-   |
-LL |     println!("{}", naïveté);
-   |                    ^^^^^^^
-
-error: aborting due to 4 previous errors
+error: aborting due to 3 previous errors
 
diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs
index 7ac0d035d5bf1..b5e251e047b5a 100644
--- a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs
+++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs
@@ -7,5 +7,7 @@ fn dijkstra() {} //~ ERROR identifier contains uncommon Unicode codepoints
 
 fn main() {
     let ㇻㇲㇳ = "rust"; //~ ERROR identifier contains uncommon Unicode codepoints
-    println!("{}", ㇻㇲㇳ); //~ ERROR identifier contains uncommon Unicode codepoints
+
+    // using the same identifier the second time won't trigger the lint.
+    println!("{}", ㇻㇲㇳ);
 }
diff --git a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr
index b270bd1f051c2..05ea3d5de7dbc 100644
--- a/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr
+++ b/src/test/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr
@@ -22,11 +22,5 @@ error: identifier contains uncommon Unicode codepoints
 LL |     let ㇻㇲㇳ = "rust";
    |         ^^^^^^
 
-error: identifier contains uncommon Unicode codepoints
-  --> $DIR/lint-uncommon-codepoints.rs:10:20
-   |
-LL |     println!("{}", ㇻㇲㇳ);
-   |                    ^^^^^^
-
-error: aborting due to 4 previous errors
+error: aborting due to 3 previous errors
 
diff --git a/src/test/ui/parser/issue-62524.rs b/src/test/ui/parser/issue-62524.rs
index 57de4b87b0fe6..5259dfe2e656c 100644
--- a/src/test/ui/parser/issue-62524.rs
+++ b/src/test/ui/parser/issue-62524.rs
@@ -1,4 +1,6 @@
 // ignore-tidy-trailing-newlines
 // error-pattern: aborting due to 3 previous errors
+#![allow(uncommon_codepoints)]
+
 y![
 Ϥ,
\ No newline at end of file
diff --git a/src/test/ui/parser/issue-62524.stderr b/src/test/ui/parser/issue-62524.stderr
index 8191c9682cefd..d5e07622b11b9 100644
--- a/src/test/ui/parser/issue-62524.stderr
+++ b/src/test/ui/parser/issue-62524.stderr
@@ -1,5 +1,5 @@
 error: this file contains an unclosed delimiter
-  --> $DIR/issue-62524.rs:4:3
+  --> $DIR/issue-62524.rs:6:3
    |
 LL | y![
    |   - unclosed delimiter
@@ -7,7 +7,7 @@ LL | Ϥ,
    |   ^
 
 error: macros that expand to items must be delimited with braces or followed by a semicolon
-  --> $DIR/issue-62524.rs:3:3
+  --> $DIR/issue-62524.rs:5:3
    |
 LL |   y![
    |  ___^
@@ -24,7 +24,7 @@ LL | Ϥ,;
    |   ^
 
 error: cannot find macro `y` in this scope
-  --> $DIR/issue-62524.rs:3:1
+  --> $DIR/issue-62524.rs:5:1
    |
 LL | y![
    | ^