Skip to content

Optimize bidi character detection. #90559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 6, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions compiler/rustc_ast/src/lib.rs
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@
#![feature(nll)]
#![feature(min_specialization)]
#![recursion_limit = "256"]
#![feature(slice_internals)]

#[macro_use]
extern crate rustc_macros;
@@ -25,6 +26,7 @@ pub mod util {
pub mod comments;
pub mod literal;
pub mod parser;
pub mod unicode;
}

pub mod ast;
35 changes: 35 additions & 0 deletions compiler/rustc_ast/src/util/unicode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/// Unicode code points that change the visual ordering of the surrounding text
/// (bidirectional embeddings, overrides, isolates and their terminators).
pub const TEXT_FLOW_CONTROL_CHARS: &[char] = &[
    '\u{202A}', // LEFT-TO-RIGHT EMBEDDING
    '\u{202B}', // RIGHT-TO-LEFT EMBEDDING
    '\u{202D}', // LEFT-TO-RIGHT OVERRIDE
    '\u{202E}', // RIGHT-TO-LEFT OVERRIDE
    '\u{2066}', // LEFT-TO-RIGHT ISOLATE
    '\u{2067}', // RIGHT-TO-LEFT ISOLATE
    '\u{2068}', // FIRST STRONG ISOLATE
    '\u{202C}', // POP DIRECTIONAL FORMATTING
    '\u{2069}', // POP DIRECTIONAL ISOLATE
];

/// Returns `true` if `s` contains at least one bidirectional text-flow control
/// character, i.e. any code point in U+202A..=U+202E or U+2066..=U+2069.
///
/// The scan works on the raw UTF-8 bytes instead of decoding `char`s. Every
/// code point of interest is encoded as three bytes whose first byte is `0xE2`:
///
/// ```text
/// Char   - UTF-8
/// U+202A - E2 80 AA
/// U+202B - E2 80 AB
/// U+202C - E2 80 AC
/// U+202D - E2 80 AD
/// U+202E - E2 80 AE
/// U+2066 - E2 81 A6
/// U+2067 - E2 81 A7
/// U+2068 - E2 81 A8
/// U+2069 - E2 81 A9
/// ```
///
/// so it is enough to jump from one `0xE2` byte to the next and inspect the two
/// bytes that follow it.
#[inline]
pub fn contains_text_flow_control_chars(s: &str) -> bool {
    let mut rest = s.as_bytes();
    while let Some(pos) = core::slice::memchr::memchr(0xE2, rest) {
        // `s` is valid UTF-8 and 0xE2 only occurs as the lead byte of a
        // three-byte sequence, so two more bytes are guaranteed to follow.
        let (seq, tail) = rest[pos..].split_at(3);
        if matches!(seq, [_, 0x80, 0xAA..=0xAE] | [_, 0x81, 0xA6..=0xA9]) {
            return true;
        }
        // Not a control character: resume the search after this sequence.
        rest = tail;
    }
    false
}
4 changes: 2 additions & 2 deletions compiler/rustc_lint/src/context.rs
Original file line number Diff line number Diff line change
@@ -16,9 +16,9 @@

use self::TargetLint::*;

use crate::hidden_unicode_codepoints::UNICODE_TEXT_FLOW_CHARS;
use crate::levels::{is_known_lint_tool, LintLevelsBuilder};
use crate::passes::{EarlyLintPassObject, LateLintPassObject};
use ast::util::unicode::TEXT_FLOW_CONTROL_CHARS;
use rustc_ast as ast;
use rustc_data_structures::fx::FxHashMap;
use rustc_data_structures::sync;
@@ -602,7 +602,7 @@ pub trait LintContext: Sized {
let spans: Vec<_> = content
.char_indices()
.filter_map(|(i, c)| {
UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
let lo = span.lo() + BytePos(2 + i as u32);
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
})
12 changes: 4 additions & 8 deletions compiler/rustc_lint/src/hidden_unicode_codepoints.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::{EarlyContext, EarlyLintPass, LintContext};
use ast::util::unicode::{contains_text_flow_control_chars, TEXT_FLOW_CONTROL_CHARS};
use rustc_ast as ast;
use rustc_errors::{Applicability, SuggestionStyle};
use rustc_span::{BytePos, Span, Symbol};
@@ -37,11 +38,6 @@ declare_lint! {

declare_lint_pass!(HiddenUnicodeCodepoints => [TEXT_DIRECTION_CODEPOINT_IN_LITERAL]);

crate const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
'\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
'\u{2069}',
];

impl HiddenUnicodeCodepoints {
fn lint_text_direction_codepoint(
&self,
@@ -57,7 +53,7 @@ impl HiddenUnicodeCodepoints {
.as_str()
.char_indices()
.filter_map(|(i, c)| {
UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
let lo = span.lo() + BytePos(i as u32 + padding);
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
})
@@ -131,7 +127,7 @@ impl HiddenUnicodeCodepoints {
impl EarlyLintPass for HiddenUnicodeCodepoints {
fn check_attribute(&mut self, cx: &EarlyContext<'_>, attr: &ast::Attribute) {
if let ast::AttrKind::DocComment(_, comment) = attr.kind {
if comment.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
if contains_text_flow_control_chars(&comment.as_str()) {
self.lint_text_direction_codepoint(cx, comment, attr.span, 0, false, "doc comment");
}
}
@@ -142,7 +138,7 @@ impl EarlyLintPass for HiddenUnicodeCodepoints {
let (text, span, padding) = match &expr.kind {
ast::ExprKind::Lit(ast::Lit { token, kind, span }) => {
let text = token.symbol;
if !text.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
if !contains_text_flow_control_chars(&text.as_str()) {
return;
}
let padding = match kind {
9 changes: 3 additions & 6 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use rustc_ast::ast::{self, AttrStyle};
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
use rustc_ast::tokenstream::{Spacing, TokenStream};
use rustc_ast::util::unicode::contains_text_flow_control_chars;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError, PResult};
use rustc_lexer::unescape::{self, Mode};
use rustc_lexer::{Base, DocStyle, RawStrError};
@@ -137,12 +138,8 @@ impl<'a> StringReader<'a> {
// Opening delimiter of the length 2 is not included into the comment text.
let content_start = start + BytePos(2);
let content = self.str_from(content_start);
let span = self.mk_sp(start, self.pos);
const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
'\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}',
'\u{202C}', '\u{2069}',
];
if content.contains(UNICODE_TEXT_FLOW_CHARS) {
if contains_text_flow_control_chars(content) {
let span = self.mk_sp(start, self.pos);
self.sess.buffer_lint_with_diagnostic(
&TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
span,