From 5696f1e4f3bb5595f746bea28ea494e4b645343f Mon Sep 17 00:00:00 2001 From: Ian Gregory Date: Wed, 23 Oct 2024 00:25:27 -0400 Subject: [PATCH] Convert raw strings to non-raw when fixes add escape sequences (#13294) --- .../fixtures/pylint/invalid_characters.py | Bin 1639 -> 2118 bytes crates/ruff_linter/src/checkers/tokens.rs | 6 +- .../pylint/rules/invalid_string_characters.rs | 112 ++++++++++++++++-- ..._tests__PLE2510_invalid_characters.py.snap | Bin 1894 -> 3863 bytes ..._tests__PLE2512_invalid_characters.py.snap | Bin 2398 -> 3851 bytes ..._tests__PLE2513_invalid_characters.py.snap | Bin 2342 -> 3378 bytes ..._tests__PLE2514_invalid_characters.py.snap | Bin 809 -> 1516 bytes ..._tests__PLE2515_invalid_characters.py.snap | Bin 6763 -> 7454 bytes crates/ruff_python_parser/src/token.rs | 20 ++++ 9 files changed, 128 insertions(+), 10 deletions(-) diff --git a/crates/ruff_linter/resources/test/fixtures/pylint/invalid_characters.py b/crates/ruff_linter/resources/test/fixtures/pylint/invalid_characters.py index 79c9307695a9153b9945e2976c45335259630bfa..048b5c232cdf0dfdf6ed04d11013bb87fbe15d14 100644 GIT binary patch delta 451 zcmaFPb4*}EIvXR`WIi^XdP8F)OB3bH;^NZO`x^;#h00sx}hk*xp# delta 7 OcmX>m@SJBuIvW5DXaeT| diff --git a/crates/ruff_linter/src/checkers/tokens.rs b/crates/ruff_linter/src/checkers/tokens.rs index f272b910f8d36..15cbe95587840 100644 --- a/crates/ruff_linter/src/checkers/tokens.rs +++ b/crates/ruff_linter/src/checkers/tokens.rs @@ -10,7 +10,6 @@ use ruff_diagnostics::Diagnostic; use ruff_python_index::Indexer; use ruff_python_parser::Tokens; use ruff_source_file::Locator; -use ruff_text_size::Ranged; use crate::directives::TodoComment; use crate::registry::{AsRule, Rule}; @@ -93,11 +92,12 @@ pub(crate) fn check_tokens( Rule::InvalidCharacterNul, Rule::InvalidCharacterZeroWidthSpace, ]) { + let mut last_fstring_start = None; for token in tokens { pylint::rules::invalid_string_characters( &mut diagnostics, - token.kind(), - token.range(), + token, + &mut last_fstring_start, locator, ); } diff --git a/crates/ruff_linter/src/rules/pylint/rules/invalid_string_characters.rs b/crates/ruff_linter/src/rules/pylint/rules/invalid_string_characters.rs index 16a0b8ba794e5..d636846053738 100644 --- a/crates/ruff_linter/src/rules/pylint/rules/invalid_string_characters.rs +++ b/crates/ruff_linter/src/rules/pylint/rules/invalid_string_characters.rs @@ -1,3 +1,7 @@ +use ruff_python_ast::str::Quote; +use ruff_python_ast::StringFlags; +use ruff_python_parser::Token; +use ruff_text_size::Ranged; use ruff_text_size::{TextLen, TextRange, TextSize}; use ruff_diagnostics::AlwaysFixableViolation; @@ -172,19 +176,33 @@ impl AlwaysFixableViolation for InvalidCharacterZeroWidthSpace { } /// PLE2510, PLE2512, PLE2513, PLE2514, PLE2515 -pub(crate) fn invalid_string_characters( +pub(crate) fn invalid_string_characters<'a>( diagnostics: &mut Vec, - token: TokenKind, - range: TextRange, + token: &'a Token, + last_fstring_start: &mut Option<&'a Token>, locator: &Locator, ) { - let text = match token { + struct InvalidCharacterDiagnostic { + diagnostic: Diagnostic, + edit: Edit, + } + + let kind = token.kind(); + let range = token.range(); + + let text = match kind { // We can't use the `value` field since it's decoded and e.g. for f-strings removed a curly // brace that escaped another curly brace, which would gives us wrong column information. TokenKind::String | TokenKind::FStringMiddle => locator.slice(range), + TokenKind::FStringStart => { + *last_fstring_start = Some(token); + return; + } _ => return, }; + // Accumulate diagnostics here to postpone generating shared fixes until we know we need them. + let mut new_diagnostics: Vec = Vec::new(); for (column, match_) in text.match_indices(&['\x08', '\x1A', '\x1B', '\0', '\u{200b}']) { let c = match_.chars().next().unwrap(); let (replacement, rule): (&str, DiagnosticKind) = match c { @@ -201,8 +219,88 @@ pub(crate) fn invalid_string_characters( let location = range.start() + TextSize::try_from(column).unwrap(); let range = TextRange::at(location, c.text_len()); - diagnostics.push(Diagnostic::new(rule, range).with_fix(Fix::safe_edit( - Edit::range_replacement(replacement.to_string(), range), - ))); + new_diagnostics.push(InvalidCharacterDiagnostic { + diagnostic: Diagnostic::new(rule, range), + // This is integrated with other fixes and attached to the diagnostic below. + edit: Edit::range_replacement(replacement.to_string(), range), + }); + } + if new_diagnostics.is_empty() { + // No issues, nothing to fix. + return; + } + + // Convert raw strings to non-raw strings when fixes are applied: + // https://github.com/astral-sh/ruff/issues/13294#issuecomment-2341955180 + let mut string_conversion_edits = Vec::new(); + if token.is_raw_string() { + let string_flags = token.string_flags(); + let prefix = string_flags.prefix().as_str(); + + // 1. Remove the raw string prefix. + for (column, match_) in prefix.match_indices(&['r', 'R']) { + let c = match_.chars().next().unwrap(); + + let entire_string_range = match kind { + TokenKind::String => range, + _ => last_fstring_start.unwrap().range(), + }; + let location = entire_string_range.start() + TextSize::try_from(column).unwrap(); + let range = TextRange::at(location, c.text_len()); + + string_conversion_edits.push(Edit::range_deletion(range)); + } + + // 2. Escape '\' and quote characters inside the string content. + let (content_start, content_end): (TextSize, TextSize) = match kind { + TokenKind::String => ( + prefix.text_len() + string_flags.quote_len(), + TextSize::try_from(text.len()).unwrap() - string_flags.quote_len(), + ), + _ => (0.into(), text.len().try_into().unwrap()), + }; + let string_content = &text[content_start.to_usize()..content_end.to_usize()]; + for (column, match_) in string_content.match_indices(&['\\', '\'', '"']) { + let c = match_.chars().next().unwrap(); + let replacement: &str = match c { + '\\' => "\\\\", + '\'' | '"' => { + if string_flags.is_triple_quoted() { + continue; + } + match (c, string_flags.quote_style()) { + ('\'', Quote::Single) => "\\'", + ('"', Quote::Double) => "\\\"", + _ => { + continue; + } + } + } + _ => { + continue; + } + }; + + let location = range.start() + content_start + TextSize::try_from(column).unwrap(); + let range = TextRange::at(location, c.text_len()); + + string_conversion_edits.push(Edit::range_replacement(replacement.to_string(), range)); + } + + // 3. Add back '\' characters for line continuation in non-triple-quoted strings. + if !string_flags.is_triple_quoted() { + for (column, _match) in string_content.match_indices("\\\n") { + let location = range.start() + content_start + TextSize::try_from(column).unwrap(); + string_conversion_edits.push(Edit::insertion( + "\\n\\".to_string(), + location + TextSize::from(1), + )); + } + } + } + + for InvalidCharacterDiagnostic { diagnostic, edit } in new_diagnostics { + diagnostics + .push(diagnostic.with_fix(Fix::safe_edits(edit, string_conversion_edits.clone()))); } } diff --git a/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2510_invalid_characters.py.snap b/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2510_invalid_characters.py.snap index 3f23d12764d772922a9930563ab968121993a7d8..778ca022d93c384d1f4076363b6c26853629991b 100644 GIT binary patch literal 3863 zcmcgu&2HO95bl|e;M0IUu&pdhD~Y5;zy*r5hX6SQDS9efVM#45A{5DR$vAFgpobiS z76o$YDKAijORkO zvLXnEQ5feUYst*Vd4zA%YkJk1B%||8YE9f0#?N^aj)wl2XWYlt@;tqEnCVy!Jbv_L z$7F5z=F@NCVYv>9SV*6zVg#!l;Bo({Ot~-4ppXJ|*V7@4WiI$g1AwW zAFC+NK!4C}w>$bLaPb*D!T22K>o^v^kdkNDUxb|<)<9#-oyQ<^2qP@#F5MJ)wdd??-j zFdF>`C9ksx*b55p+)pMgj-TLBQ;ts!4K=g_v$}ZlHVeeqtDgW>M)@A{ixx7vWL6h(xEQE=BTI|2IWs zwE=B|K6I%Fv>2da!6U>+M_DPol3t1TD!n?;#7kr6a%|n;x{4X<;fV9TQ164bWdYg- zlXY@*MUM_2-BAd(=^XP!v1`h$gjAj`c$HNe)=@W_O;SYp;D~gFdM;H|xg^-TbF4Yk zOP6#kkVm5Sz?N>e0c``?iZC`mi$$U`VPKO=RW2Q{sU4^SLO|*wyB8C+VF`b#tXlO=pF#Dy~Nn=16oPk?p|y6k1Mf56;Tb9P_3$r zu3%Ky|2)PrypqAo^?w7_7nMWx)rGz5WTRBp)q?CPfl%y=e!fc+8Fm3BRlD4xq}ngX lq%R~8CnN`@uzS3f&Psuy92%r9wIB6B(n4?c0MPcu^>00A(UAZE delta 7 OcmbO(_l$2t8an_Cbppo# diff --git a/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2512_invalid_characters.py.snap b/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2512_invalid_characters.py.snap index 3ef59bd9291bda5a554c924a091ce5baeb4f46d0..234b6db863595ee6920f3ffb28b22474442d9f30 100644 GIT binary patch literal 3851 zcmcgvPjBNy6z4o2p{FPB!4knuJ;~UKr9FVe0f_@*#c6{q*BLh!&Y$(zv@ZQ*QohIJmVn|<_$@I<4d%xfNy>TteVrrSEq4Xe3gG{P^ znn#hJ#EVR(eU*mzJ;B-4OPcD>m($Z!F$V`$A_NHJ^14D zPr0x;Jpb@(_^fb*d?8gBtmG7`CGhO4Pd=YIB(L&iK$oMs=HSi0^_3cjq1*{;wj8zvhv_dxk`di!eyMz zpokKbe38pVC>gu`@ega4c-@ z02<}*0-A=PK*z*&fytUk5uQEhij+g>kA+Iv-f=;rijQ3ehvCzr$I|jJ0Y{1kBd0S< zTI%INIhDK=(4>>qsmeem9gFYwtf5acJupo^XGQ+s~*4GZvNVC-s$l#+qXZ#?GLZf3?_(sCP+}I zPi8kIWN%-`>12ukq(lm4tWo+LeURB_XJypTI23R?6%5FHnp5 z`DwVE_X9j7NGvte&8Z(Nm19eCheABjWAvHJMP}LV&>oLR_L-v(eq<>L4rncrYzgE5 z{tvP$z7X-X_0K6IhI`HKB!xQ3NMFTZrzDCIpsnFbm#Sf0Uk4}WhG8`IIWg+)^65H} zBUunDl@@4F>0#WF61Z@fbGDBlYExrKr$1I90@^fz|FS+P3+tdhaFz5BJ+AA7J|w!XBM5hpv(|xbziv z_e7Q-FY;w3QH!ZLfr$wQV8=I^L$?C?;#IIZ;%lNeIiHy3Ll*+_wtMZg;LvSTi)6aY z&l7w!Dr}KLQiQ}wVVsWDtr+0 zee{W~UZX#`7ykNr2MI&`TT;-91;+)nJ%(!;6pt|YpGR4BNOhBzT<+N&Ny(*-c^sm9 z0U=x%kxLpkXh0gQmHDoQ-Fs@P)#d(%hTe|!ge5)sS0&mSS7!uhMIz1^(AM*k00L4XqL-AQ=0AKf*|*H)rrWq2G_0dbp@bdnm4Zu F_BT3x!TA6H delta 7 OcmeB{yC<|EjuQY2`vS25 diff --git a/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2513_invalid_characters.py.snap b/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2513_invalid_characters.py.snap index 993c89041e59f44a6191eeb587a963a7feaca077..d777c7ea4d665308ae91e281500f31f205c95ef2 100644 GIT binary patch delta 774 zcmZ1`v`K2iZ*~qdLxmazE;FOa^V#%d4ULT~O_Vc>i%V0J^K)}k^GbA$j7r22FEG2;lVX^e3Dn|}qRav; z7OMlT2H7m7uCC5yW{T>{l>E}9oK(0gl|V{klvshzoUJtRqUhuUEL=d$Ir$U|SG1WK zszvB}mB6}{6d070l(@{yQ7wXm3fQ9f+|rzq%$&?TXutq%ocxR3hSkgn8rbzz4Qyce zkQhYu3P4b!ixipYUWEh`IFz`6YC)l@jZ?W&3|M(gjFLi3g`pELY|Iq!hNYiE5ikfL Wu?7oF(!!I=%t8T(YJkxm!vz3P8uS$a delta 12 TcmdlawM=NkZ}!dp9NLTkB2ffw diff --git a/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2514_invalid_characters.py.snap b/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2514_invalid_characters.py.snap index 3ab1a52ec4958bb83546a3b43e2f688ca4a4c373..f416d129798fb8c67cc98d9651c34a0e2fb0b9a5 100644 GIT binary patch literal 1516 zcmb`H!ET%|5QcO1BlPsk7o?_ScLVEWk#g*zQdCu~dMXtnV7jOTs0~dbap<9kNWJvl zC+X|#lXQ%;h~4Z~+9+s{u=U&i=bJIE>oQ%nDv<#a6*p2(RGVc{k(Z5B6Ri>&7c^Vn znW@Qjm7XZgTw0cw-{T@rqhuMYI3cw@sqX^sESLv)clr8sKJ)SERmvs>*5;u9XgtY}NzqPVVBx9?efAJyCB(nULc|IsQql&7pv_s~Ex3ep9 zGzUcoK8Df!)D**^tEpYBj?%4AOgga|zGEZr_l zi>de~YE)hqGSYeZrJ#na8c8jusAEDn;CNwrN0_!VVylYMs=XSCxsiq(tKWY(6XqI? zHs03qUd?gDIUijs!uY>ued~m2oRO1}?dZBKnw)+bHi3{VF<~w#rw`h@d*Xc*9w_#m z+QSM#rf+EUXWkxD{@4~IN(77I{xSpE3ruz%a) BvH}1A delta 9 QcmaFEy^?K%1~VfU01=e}jQ{`u diff --git a/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2515_invalid_characters.py.snap b/crates/ruff_linter/src/rules/pylint/snapshots/ruff_linter__rules__pylint__tests__PLE2515_invalid_characters.py.snap index bf097d02f2cecfa62c8fd2e6eaf1fcd183a2a6ab..731a700d14528fc1422b0d9a233b0b1f573f2dbe 100644 GIT binary patch delta 545 zcmaEDGS6y5mek}%F-c}KORLEj`NHeX%oS=BiW1A?i!<}mb5fyHVQGFzYJ6^KPDy4? zW?rg-twND{jJiS$mzf1nxe|zB31mEK=vL=aPyn(N0(@MJObt!B%nhLCl@w(b;5AQO z9cUmYm$@Ot#K|!dHuYvE3P1#L5T*fOXQ->kD5wLSAO+OVWoD`XL?|9e$uCXHNd7(dL^(9B?Sf`iwhV!An)mt6hL4d&@ckCVZj6n zYi)c+)k6bYJthVe+A%SCF`(cD1-drSKsX;5_S!L}Mg|5+>YxB8G0Z_h4hwZIb0Y;H Is(}P20AlT@pa1{> delta 9 QcmbPd_1a`ZmJ}lw02F=$Hvj+t diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs index 9c240f4964e5b..8d5763fdab457 100644 --- a/crates/ruff_python_parser/src/token.rs +++ b/crates/ruff_python_parser/src/token.rs @@ -54,6 +54,16 @@ impl Token { self.flags.is_triple_quoted() } + /// Returns `true` if the current token is a raw string of any kind. + /// + /// # Panics + /// + /// If it isn't a string or any f-string tokens. + pub const fn is_raw_string(self) -> bool { + assert!(self.is_any_string()); + self.flags.is_raw_string() + } + /// Returns the [`Quote`] style for the current string token of any kind. /// /// # Panics @@ -64,6 +74,16 @@ impl Token { self.flags.quote_style() } + /// Returns the string flags for the current string token of any kind. + /// + /// # Panics + /// + /// If it isn't a string or any f-string tokens. + pub fn string_flags(self) -> AnyStringFlags { + assert!(self.is_any_string()); + self.flags.as_any_string_flags() + } + /// Returns `true` if this is any kind of string token. const fn is_any_string(self) -> bool { matches!(