Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(es/typescript): Handle unicode for fast ts strip #9202

Merged
merged 4 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 79 additions & 13 deletions crates/swc_fast_ts_strip/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,22 +113,66 @@ pub fn operate(
return Ok(fm.src.to_string());
}

let source = fm.src.clone();
let mut code = fm.src.to_string().into_bytes();

for r in replacements {
for c in &mut code[(r.0 .0 - 1) as usize..(r.1 .0 - 1) as usize] {
if *c == b'\n' || *c == b'\r' {
continue;
let (start, end) = (r.0 .0 as usize - 1, r.1 .0 as usize - 1);

for (i, c) in source[start..end].char_indices() {
let i = start + i;
match c {
// https://262.ecma-international.org/#sec-white-space
'\u{0009}' | '\u{0000B}' | '\u{000C}' | '\u{FEFF}' => continue,
// Space_Separator
'\u{0020}' | '\u{00A0}' | '\u{1680}' | '\u{2000}' | '\u{2001}' | '\u{2002}'
| '\u{2003}' | '\u{2004}' | '\u{2005}' | '\u{2006}' | '\u{2007}' | '\u{2008}'
| '\u{2009}' | '\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}' => continue,
// https://262.ecma-international.org/#sec-line-terminators
'\u{000A}' | '\u{000D}' | '\u{2028}' | '\u{2029}' => continue,
_ => match c.len_utf8() {
1 => {
// Space 0020
code[i] = 0x20;
}
2 => {
// No-Break Space 00A0
code[i] = 0xc2;
code[i + 1] = 0xa0;
}
3 => {
// En Space 2002
code[i] = 0xe2;
code[i + 1] = 0x80;
code[i + 2] = 0x82;
}
4 => {
// We do not have a 4-byte space character in the Unicode standard.

// Space 0020
code[i] = 0x20;
// ZWNBSP FEFF
code[i + 1] = 0xef;
code[i + 2] = 0xbb;
code[i + 3] = 0xbf;
}
_ => unreachable!(),
},
}
*c = b' ';
}
}

for (i, v) in overwrites {
code[i.0 as usize - 1] = v;
}

String::from_utf8(code).map_err(|_| anyhow::anyhow!("failed to convert to utf-8"))
if cfg!(debug_assertions) {
String::from_utf8(code).map_err(|_| anyhow::anyhow!("failed to convert to utf-8"))
} else {
// SAFETY: We've already validated that the source is valid utf-8
// and our operations are limited to character-level string replacements.
unsafe { Ok(String::from_utf8_unchecked(code)) }
}
}

struct TsStrip {
Expand Down Expand Up @@ -197,15 +241,18 @@ impl Visit for TsStrip {
if let Some(ret) = &n.return_type {
self.add_replacement(ret.span);

let l_paren = self.get_prev_token(ret.span_lo() - BytePos(1));
debug_assert_eq!(l_paren.token, Token::RParen);
let r_paren = self.get_prev_token(ret.span_lo() - BytePos(1));
debug_assert_eq!(r_paren.token, Token::RParen);
let arrow = self.get_next_token(ret.span_hi());
debug_assert_eq!(arrow.token, Token::Arrow);
let span = span(l_paren.span.lo, arrow.span.hi);
let span = span(r_paren.span.lo, arrow.span.lo);

let slice = self.get_src_slice(span).as_bytes();
if slice.contains(&b'\n') {
self.add_replacement(l_paren.span);
let slice = self.get_src_slice(span);
if slice
.chars()
.any(|c| matches!(c, '\u{000A}' | '\u{000D}' | '\u{2028}' | '\u{2029}'))
{
self.add_replacement(r_paren.span);

// Instead of moving the arrow mark, we shift the right parenthesis to the next
// line. This is because there might be a line break after the right
Expand All @@ -219,11 +266,17 @@ impl Visit for TsStrip {
//
// ```TypeScript
// (
// )=>
// ) =>
// 1;
// ```

self.add_overwrite(ret.span_hi() - BytePos(1), b')');
let mut pos = ret.span_hi() - BytePos(1);
while !self.src.as_bytes()[pos.0 as usize - 1].is_utf8_char_boundary() {
self.add_overwrite(pos, b' ');
pos = pos - BytePos(1);
}

self.add_overwrite(pos, b')');
}
}

Expand Down Expand Up @@ -611,6 +664,19 @@ impl Visit for TsStrip {
}
}

/// Extension trait that gives a byte a UTF-8 character-boundary test.
trait U8Helper {
/// Returns `true` when the byte can start a UTF-8 encoded character, i.e. it
/// is not a continuation byte (`b < 128 || b >= 192`).
fn is_utf8_char_boundary(&self) -> bool;
}

impl U8Helper for u8 {
// Copied from the crate-private `u8::is_utf8_char_boundary` in `core::num`.
#[inline]
fn is_utf8_char_boundary(&self) -> bool {
// This is bit magic equivalent to: b < 128 || b >= 192
(*self as i8) >= -0x40
Comment on lines +675 to +676
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this copied from some legacy code, or eyeball optimized? The expression in comment compiles to exact same code with the "optimized" one: https://godbolt.org/z/KsqjM8jad

IMO, readability is more important if they are actually the same.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy from std::core::num::u8

}
}

/// Builds a [`Span`] running from `lo` to `hi`, supplying the `Default` value
/// for the constructor's remaining argument.
fn span(lo: BytePos, hi: BytePos) -> Span {
    let rest = Default::default();
    Span::new(lo, hi, rest)
}
16 changes: 16 additions & 0 deletions crates/swc_fast_ts_strip/tests/fixture/test-case-1.js
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,19 @@ void 0;
(
)=>
1;

{
(a, b, c = [] /*comment-1*/ /*comment-2*/
)/*comment-4*/=>
1
};



(
   ) =>
1;

( /*comment-1*/
   ) /*comment-4*/=>
1;
16 changes: 16 additions & 0 deletions crates/swc_fast_ts_strip/tests/fixture/test-case-1.ts
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,19 @@ void 0;
():
any=>
1;

{
(a, b, c: D = [] as any/*comment-1*/)/*comment-2*/:
/*comment-3*/any/*comment-4*/=>
1
};

type 任意の型 = any;

():
任意の型=>
1;

()/*comment-1*/:/*comment-2*/
/*comment-3*/任意の型/*comment-4*/=>
1;
7 changes: 7 additions & 0 deletions crates/swc_fast_ts_strip/tests/fixture/unicode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@


function foo() {
   (void 1); throw new Error('foo');
}

foo();
7 changes: 7 additions & 0 deletions crates/swc_fast_ts_strip/tests/fixture/unicode.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
type 任意 = any;

function foo() {
<任意>(void 1); throw new Error('foo');
}

foo();
Loading