From f41a27e86da324f9c3fa7d533c77b91fb8700d41 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 7 Jan 2025 15:16:34 -0500 Subject: [PATCH] Don't normalize strings in the CLI (#127) * Normalize multiline strings in the formatter so we don't have to normalize in the CLI Allowing us to actually take advantage of the `Unchanged` optimization with CRLF endings, and correctly handle `--check` too * Add a parser snapshot test for multiline strings with CRLF line endings To prove we can parse these line endings, and to prove that the CRLF ends up in the `RStringValue` * Add CHANGELOG bullets * Mention why no `line_ending` crate usage * Don't use `Cell` after all, since it's more mental overhead than it's worth --- .gitattributes | 2 + CHANGELOG.md | 4 + crates/air/src/commands/format.rs | 15 -- .../fixtures/crlf/multiline_string_value.R | 2 + crates/air/tests/format.rs | 22 +++ crates/air_formatter_test/src/spec.rs | 3 - crates/air_r_formatter/src/lib.rs | 1 + .../src/r/auxiliary/string_value.rs | 3 +- crates/air_r_formatter/src/string_literal.rs | 148 ++++++++++++++++++ .../ok/crlf/multiline_string_value.R | 2 + .../ok/crlf/multiline_string_value.R.snap | 38 +++++ crates/air_r_parser/tests/spec_test.rs | 3 - 12 files changed, 221 insertions(+), 22 deletions(-) create mode 100644 crates/air/tests/fixtures/crlf/multiline_string_value.R create mode 100644 crates/air_r_formatter/src/string_literal.rs create mode 100644 crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R create mode 100644 crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R.snap diff --git a/.gitattributes b/.gitattributes index 922c1a66..dc0eeb62 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,6 @@ * text=auto eol=lf # Windows specific test files where we need CRLF endings +crates/air/tests/fixtures/crlf/*.R text eol=crlf crates/air_r_formatter/tests/specs/r/crlf/*.R text eol=crlf +crates/air_r_parser/tests/snapshots/ok/crlf/*.R text eol=crlf diff --git a/CHANGELOG.md b/CHANGELOG.md index ebed5c0b..e089eea6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ # Development version +- `air format` is now faster on Windows when nothing changes (#90). + +- `air format --check` now works correctly with Windows line endings (#123). + - Magic line breaks are now supported in left assignment (#118). diff --git a/crates/air/src/commands/format.rs b/crates/air/src/commands/format.rs index 46c127a6..5400b7dd 100644 --- a/crates/air/src/commands/format.rs +++ b/crates/air/src/commands/format.rs @@ -179,24 +179,11 @@ fn format_file(path: PathBuf, mode: FormatMode) -> Result formatted, Err(err) => return Err(FormatCommandError::Format(path.clone(), err)), }; - // TODO: We rarely ever take advantage of this optimization on Windows right - // now. We always normalize on entry but we apply the requested line ending - // on exit (so on Windows we often infer CRLF on entry and normalize to - // LF, but apply CRLF on exit so `source` and `new` always have different - // line endings). We probably need to compare pre-normalized against - // post-formatted output? - // - // Ah, no, the right thing to do is for the cli to not care about normalizing - // line endings. This is mostly useful in the LSP for all the document manipulation - // we are going to do there. In the cli, we want to format a whole bunch of files - // so we really want this optimization, and since the formatter and parser can handle - // windows line endings just fine, we should not normalize here. match formatted { FormattedSource::Formatted(new) => { match mode { @@ -231,8 +218,6 @@ pub(crate) enum FormatSourceError { } /// Formats a vector of `source` code -/// -/// Safety: `source` should already be normalized to Unix line endings pub(crate) fn format_source( source: &str, options: RFormatOptions, diff --git a/crates/air/tests/fixtures/crlf/multiline_string_value.R b/crates/air/tests/fixtures/crlf/multiline_string_value.R new file mode 100644 index 00000000..198f3f21 --- /dev/null +++ b/crates/air/tests/fixtures/crlf/multiline_string_value.R @@ -0,0 +1,2 @@ +"multiline +string" diff --git a/crates/air/tests/format.rs b/crates/air/tests/format.rs index 6e9eb285..f710a2fb 100644 --- a/crates/air/tests/format.rs +++ b/crates/air/tests/format.rs @@ -1,3 +1,6 @@ +use std::path::Path; +use std::path::PathBuf; + use air::args::Args; use air::run; use air::status::ExitStatus; @@ -21,3 +24,22 @@ fn default_options() -> anyhow::Result<()> { assert_eq!(err, ExitStatus::Success); Ok(()) } + +#[test] +fn test_check_returns_cleanly_for_multiline_strings_with_crlf_line_endings() -> anyhow::Result<()> { + let fixtures = path_fixtures(); + let path = fixtures.join("crlf").join("multiline_string_value.R"); + let path = path.to_str().unwrap(); + + let args = Args::parse_from(["", "format", path, "--check"]); + let err = run(args)?; + + assert_eq!(err, ExitStatus::Success); + Ok(()) +} + +fn path_fixtures() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} diff --git a/crates/air_formatter_test/src/spec.rs b/crates/air_formatter_test/src/spec.rs index 78e93112..a096a08f 100644 --- a/crates/air_formatter_test/src/spec.rs +++ b/crates/air_formatter_test/src/spec.rs @@ -28,9 +28,6 @@ impl<'a> SpecTestFile<'a> { let input_code = std::fs::read_to_string(input_file).unwrap(); - // Normalize to Unix line endings - let input_code = line_ending::normalize(input_code); - // For the whole file, not a specific range right now let range_start_index = None; let range_end_index = None; diff --git a/crates/air_r_formatter/src/lib.rs b/crates/air_r_formatter/src/lib.rs index 93fff37f..4da17881 100644 --- a/crates/air_r_formatter/src/lib.rs +++ b/crates/air_r_formatter/src/lib.rs @@ -25,6 +25,7 @@ mod prelude; mod r; pub(crate) mod separated; mod statement_body; +mod string_literal; #[rustfmt::skip] mod generated; diff --git a/crates/air_r_formatter/src/r/auxiliary/string_value.rs b/crates/air_r_formatter/src/r/auxiliary/string_value.rs index 9c7c9ef1..2dff8c9d 100644 --- a/crates/air_r_formatter/src/r/auxiliary/string_value.rs +++ b/crates/air_r_formatter/src/r/auxiliary/string_value.rs @@ -1,4 +1,5 @@ use crate::prelude::*; +use crate::string_literal::FormatStringLiteralToken; use air_r_syntax::RStringValue; use air_r_syntax::RStringValueFields; use biome_formatter::write; @@ -8,6 +9,6 @@ pub(crate) struct FormatRStringValue; impl FormatNodeRule for FormatRStringValue { fn fmt_fields(&self, node: &RStringValue, f: &mut RFormatter) -> FormatResult<()> { let RStringValueFields { value_token } = node.as_fields(); - write![f, [value_token.format()]] + write![f, [FormatStringLiteralToken::new(&value_token?)]] } } diff --git a/crates/air_r_formatter/src/string_literal.rs b/crates/air_r_formatter/src/string_literal.rs new file mode 100644 index 00000000..954aa5fa --- /dev/null +++ b/crates/air_r_formatter/src/string_literal.rs @@ -0,0 +1,148 @@ +use air_r_syntax::RSyntaxKind::R_STRING_LITERAL; +use air_r_syntax::RSyntaxToken; +use biome_formatter::prelude::syntax_token_cow_slice; +use biome_formatter::prelude::Formatter; +use biome_formatter::trivia::format_replaced; +use biome_formatter::Format; +use biome_formatter::FormatResult; +use std::borrow::Cow; + +use crate::context::RFormatContext; +use crate::RFormatter; + +/// Helper utility for formatting a string literal token +/// +/// The main job of this utility is to `normalize()` the string and handle the +/// complicated way we have to call [format_replaced] with that normalized result. +pub(crate) struct FormatStringLiteralToken<'token> { + /// The string literal token to format + token: &'token RSyntaxToken, +} + +impl<'token> FormatStringLiteralToken<'token> { + pub(crate) fn new(token: &'token RSyntaxToken) -> Self { + Self { token } + } + + fn normalize(&self) -> FormatNormalizedStringLiteralToken { + let token = self.token; + + debug_assert!( + matches!(token.kind(), R_STRING_LITERAL), + "Found kind {:?}", + token.kind() + ); + + let text = token.text_trimmed(); + let text = normalize_string(text); + + FormatNormalizedStringLiteralToken { token, text } + } +} + +impl Format for FormatStringLiteralToken<'_> { + fn fmt(&self, f: &mut RFormatter) -> FormatResult<()> { + self.normalize().fmt(f) + } +} + +struct FormatNormalizedStringLiteralToken<'token> { + /// The original string literal token before normalization + token: &'token RSyntaxToken, + + /// The normalized text + text: Cow<'token, str>, +} + +impl Format for FormatNormalizedStringLiteralToken<'_> { + fn fmt(&self, f: &mut Formatter) -> FormatResult<()> { + format_replaced( + self.token, + &syntax_token_cow_slice( + // Cloning the `Cow` is cheap since 99% of the time it will be the + // `Borrowed` variant. Only with multiline strings on Windows will it + // ever actually clone the underlying string. + self.text.clone(), + self.token, + self.token.text_trimmed_range().start(), + ), + ) + .fmt(f) + } +} + +/// Normalize a string, returning a [`Cow::Borrowed`] if the input was already normalized +/// +/// This function: +/// - Normalizes all line endings to `\n` +/// +/// We may perform more normalization in the future. We don't use utilities from the +/// `line_ending` crate because we don't own the string. +/// +/// This function is particularly useful for multiline strings, which capture the existing +/// line ending inside the string token itself. We must normalize those line endings to +/// `\n` before the formatter -> printer stage, because the printer can't handle other +/// line endings and will panic on them. At the printer -> string stage at the very end, +/// the printer will replace all `\n` with the `LineEnding` requested by the user. +/// https://github.com/biomejs/biome/blob/a658a294087c143b83350cbeb6b44f7a2e9afdd1/crates/biome_formatter/src/printer/mod.rs#L714-L718 +fn normalize_string(input: &str) -> Cow { + // The normalized string if `input` is not yet normalized. + // `output` must remain empty if `input` is already normalized. + let mut output = String::new(); + + // Tracks the last index of `input` that has been written to `output`. + // If `last_loc` is `0` at the end, then the input is already normalized and can be returned as is. + let mut last_loc = 0; + + let mut iter = input.char_indices().peekable(); + + while let Some((loc, char)) = iter.next() { + if char == '\r' { + output.push_str(&input[last_loc..loc]); + + if iter.peek().is_some_and(|(_, next)| next == &'\n') { + // CRLF support - skip over the '\r' character, keep the `\n` + iter.next(); + } else { + // CR support - Replace the `\r` with a `\n` + output.push('\n'); + } + + last_loc = loc + '\r'.len_utf8(); + } + } + + if last_loc == 0 { + Cow::Borrowed(input) + } else { + output.push_str(&input[last_loc..]); + Cow::Owned(output) + } +} + +#[cfg(test)] +mod tests { + use crate::string_literal::normalize_string; + use std::borrow::Cow; + + #[test] + fn normalize_empty() { + let x = ""; + assert_eq!(normalize_string(x), Cow::Borrowed(x)); + } + + #[test] + fn normalize_newlines() { + let x = "abcd"; + assert_eq!(normalize_string(x), Cow::Borrowed(x)); + + let x = "a\nb\nc\nd\n"; + assert_eq!(normalize_string(x), Cow::Borrowed(x)); + + let x = "a\nb\rc\r\nd\n"; + assert_eq!( + normalize_string(x), + Cow::Owned::(String::from("a\nb\nc\nd\n")) + ); + } +} diff --git a/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R b/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R new file mode 100644 index 00000000..198f3f21 --- /dev/null +++ b/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R @@ -0,0 +1,2 @@ +"multiline +string" diff --git a/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R.snap b/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R.snap new file mode 100644 index 00000000..bec48733 --- /dev/null +++ b/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R.snap @@ -0,0 +1,38 @@ +--- +source: crates/air_r_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +```R +"multiline +string" + +``` + + +## AST + +``` +RRoot { + bom_token: missing (optional), + expressions: RExpressionList [ + RStringValue { + value_token: R_STRING_LITERAL@0..19 "\"multiline\r\nstring\"" [] [], + }, + ], + eof_token: EOF@19..21 "" [Newline("\r\n")] [], +} +``` + +## CST + +``` +0: R_ROOT@0..21 + 0: (empty) + 1: R_EXPRESSION_LIST@0..19 + 0: R_STRING_VALUE@0..19 + 0: R_STRING_LITERAL@0..19 "\"multiline\r\nstring\"" [] [] + 2: EOF@19..21 "" [Newline("\r\n")] [] + +``` diff --git a/crates/air_r_parser/tests/spec_test.rs b/crates/air_r_parser/tests/spec_test.rs index d22860a9..4f990f25 100644 --- a/crates/air_r_parser/tests/spec_test.rs +++ b/crates/air_r_parser/tests/spec_test.rs @@ -52,9 +52,6 @@ pub fn run(test_case: &str, _snapshot_name: &str, test_directory: &str, outcome_ let content = fs::read_to_string(test_case_path) .expect("Expected test path to be a readable file in UTF8 encoding"); - // Normalize to Unix line endings - let content = line_ending::normalize(content); - let options = RParserOptions::default(); let parsed = parse(&content, options); let root = RRoot::unwrap_cast(parsed.syntax());