-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Don't normalize strings in the CLI (#127)
* Normalize multiline strings in the formatter so we don't have to normalize in the CLI, allowing us to actually take advantage of the `Unchanged` optimization with CRLF endings, and correctly handle `--check` too.
* Add a parser snapshot test for multiline strings with CRLF line endings, to prove we can parse these line endings and to prove that the CRLF ends up in the `RStringValue`.
* Add CHANGELOG bullets.
* Mention why no `line_ending` crate usage.
* Don't use `Cell` after all, since it's more mental overhead than it's worth.
- Loading branch information
1 parent
92887d3
commit f41a27e
Showing
12 changed files
with
221 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,6 @@ | ||
* text=auto eol=lf | ||
|
||
# Windows specific test files where we need CRLF endings | ||
crates/air/tests/fixtures/crlf/*.R text eol=crlf | ||
crates/air_r_formatter/tests/specs/r/crlf/*.R text eol=crlf | ||
crates/air_r_parser/tests/snapshots/ok/crlf/*.R text eol=crlf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
"multiline | ||
string" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
use air_r_syntax::RSyntaxKind::R_STRING_LITERAL; | ||
use air_r_syntax::RSyntaxToken; | ||
use biome_formatter::prelude::syntax_token_cow_slice; | ||
use biome_formatter::prelude::Formatter; | ||
use biome_formatter::trivia::format_replaced; | ||
use biome_formatter::Format; | ||
use biome_formatter::FormatResult; | ||
use std::borrow::Cow; | ||
|
||
use crate::context::RFormatContext; | ||
use crate::RFormatter; | ||
|
||
/// Helper utility for formatting a string literal token | ||
/// | ||
/// The main job of this utility is to `normalize()` the string and handle the | ||
/// complicated way we have to call [format_replaced] with that normalized result. | ||
pub(crate) struct FormatStringLiteralToken<'token> { | ||
/// The string literal token to format | ||
token: &'token RSyntaxToken, | ||
} | ||
|
||
impl<'token> FormatStringLiteralToken<'token> { | ||
pub(crate) fn new(token: &'token RSyntaxToken) -> Self { | ||
Self { token } | ||
} | ||
|
||
fn normalize(&self) -> FormatNormalizedStringLiteralToken { | ||
let token = self.token; | ||
|
||
debug_assert!( | ||
matches!(token.kind(), R_STRING_LITERAL), | ||
"Found kind {:?}", | ||
token.kind() | ||
); | ||
|
||
let text = token.text_trimmed(); | ||
let text = normalize_string(text); | ||
|
||
FormatNormalizedStringLiteralToken { token, text } | ||
} | ||
} | ||
|
||
impl Format<RFormatContext> for FormatStringLiteralToken<'_> { | ||
fn fmt(&self, f: &mut RFormatter) -> FormatResult<()> { | ||
self.normalize().fmt(f) | ||
} | ||
} | ||
|
||
struct FormatNormalizedStringLiteralToken<'token> { | ||
/// The original string literal token before normalization | ||
token: &'token RSyntaxToken, | ||
|
||
/// The normalized text | ||
text: Cow<'token, str>, | ||
} | ||
|
||
impl Format<RFormatContext> for FormatNormalizedStringLiteralToken<'_> { | ||
fn fmt(&self, f: &mut Formatter<RFormatContext>) -> FormatResult<()> { | ||
format_replaced( | ||
self.token, | ||
&syntax_token_cow_slice( | ||
// Cloning the `Cow<str>` is cheap since 99% of the time it will be the | ||
// `Borrowed` variant. Only with multiline strings on Windows will it | ||
// ever actually clone the underlying string. | ||
self.text.clone(), | ||
self.token, | ||
self.token.text_trimmed_range().start(), | ||
), | ||
) | ||
.fmt(f) | ||
} | ||
} | ||
|
||
/// Normalize a string, handing back a [`Cow::Borrowed`] when there is nothing to change
///
/// Normalization currently consists of:
/// - Rewriting every line ending (`\r\n` and lone `\r`) to `\n`
///
/// More normalization may be added later. The utilities from the `line_ending` crate
/// are not used here because we don't own the string.
///
/// This matters most for multiline strings, where the string token itself captures the
/// line endings found in the source. Those must become `\n` before the
/// formatter -> printer stage, because the printer can't handle other line endings and
/// will panic on them. At the very end, in the printer -> string stage, the printer
/// replaces every `\n` with the `LineEnding` requested by the user.
/// https://github.com/biomejs/biome/blob/a658a294087c143b83350cbeb6b44f7a2e9afdd1/crates/biome_formatter/src/printer/mod.rs#L714-L718
fn normalize_string(input: &str) -> Cow<str> {
    // Without any `\r` the input is already normalized and is returned as is.
    if !input.contains('\r') {
        return Cow::Borrowed(input);
    }

    let mut normalized = String::new();

    // The part of `input` that has not been copied into `normalized` yet.
    let mut remaining = input;

    while let Some(cr) = remaining.find('\r') {
        // Copy everything in front of the `\r`, but not the `\r` itself.
        normalized.push_str(&remaining[..cr]);

        let after_cr = &remaining[cr + '\r'.len_utf8()..];

        // CRLF: the `\n` that follows is kept and gets copied with the next chunk.
        // Lone CR: nothing follows that would end the line, so write the `\n` here.
        if !after_cr.starts_with('\n') {
            normalized.push('\n');
        }

        remaining = after_cr;
    }

    normalized.push_str(remaining);
    Cow::Owned(normalized)
}
|
||
#[cfg(test)]
mod tests {
    use crate::string_literal::normalize_string;
    use std::borrow::Cow;

    /// Asserts that `input` is returned unchanged AND without allocating.
    ///
    /// `Cow`'s `PartialEq` only compares the string contents, so comparing against
    /// `Cow::Borrowed(input)` with `assert_eq!` would also pass for a `Cow::Owned`.
    /// The variant is therefore checked explicitly.
    fn assert_borrowed(input: &str) {
        let normalized = normalize_string(input);
        assert!(
            matches!(normalized, Cow::Borrowed(_)),
            "expected `Cow::Borrowed` for {input:?}"
        );
        assert_eq!(normalized, input);
    }

    /// Asserts that `input` is rewritten into a freshly allocated `expected`.
    fn assert_owned(input: &str, expected: &str) {
        let normalized = normalize_string(input);
        assert!(
            matches!(normalized, Cow::Owned(_)),
            "expected `Cow::Owned` for {input:?}"
        );
        assert_eq!(normalized, expected);
    }

    #[test]
    fn normalize_empty() {
        assert_borrowed("");
    }

    #[test]
    fn normalize_newlines() {
        assert_borrowed("abcd");
        assert_borrowed("a\nb\nc\nd\n");
        assert_owned("a\nb\rc\r\nd\n", "a\nb\nc\nd\n");
    }

    #[test]
    fn normalize_carriage_return_edges() {
        // A lone `\r` or `\r\n` as the entire input
        assert_owned("\r", "\n");
        assert_owned("\r\n", "\n");

        // `\r` at the very start and at the very end
        assert_owned("\rabc", "\nabc");
        assert_owned("abc\r", "abc\n");

        // A lone `\r` directly followed by a `\r\n` yields two line endings
        assert_owned("\r\r\n", "\n\n");
        assert_owned("\r\n\r", "\n\n");

        // Multi-byte characters around the line endings keep slicing on char boundaries
        assert_owned("é\r\nü\r", "é\nü\n");
    }
}
2 changes: 2 additions & 0 deletions
2
crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
"multiline | ||
string" |
38 changes: 38 additions & 0 deletions
38
crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R.snap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
--- | ||
source: crates/air_r_parser/tests/spec_test.rs | ||
expression: snapshot | ||
--- | ||
## Input | ||
|
||
```R | ||
"multiline | ||
string" | ||
``` | ||
|
||
|
||
## AST | ||
|
||
``` | ||
RRoot { | ||
bom_token: missing (optional), | ||
expressions: RExpressionList [ | ||
RStringValue { | ||
value_token: R_STRING_LITERAL@0..19 "\"multiline\r\nstring\"" [] [], | ||
}, | ||
], | ||
eof_token: EOF@19..21 "" [Newline("\r\n")] [], | ||
} | ||
``` | ||
|
||
## CST | ||
|
||
``` | ||
0: R_ROOT@0..21 | ||
0: (empty) | ||
1: R_EXPRESSION_LIST@0..19 | ||
0: R_STRING_VALUE@0..19 | ||
0: R_STRING_LITERAL@0..19 "\"multiline\r\nstring\"" [] [] | ||
2: EOF@19..21 "" [Newline("\r\n")] [] | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters