Skip to content

Commit

Permalink
Don't normalize strings in the CLI (#127)
Browse files Browse the repository at this point in the history
* Normalize multiline strings in the formatter so we don't have to normalize in the CLI

Allowing us to actually take advantage of the `Unchanged` optimization with CRLF endings, and correctly handle `--check` too

* Add a parser snapshot test for multiline strings with CRLF line endings

To prove we can parse these line endings, and to prove that the CRLF ends up in the `RStringValue`

* Add CHANGELOG bullets

* Mention why no `line_ending` crate usage

* Don't use `Cell` after all, since it's more mental overhead than it's worth
  • Loading branch information
DavisVaughan authored Jan 7, 2025
1 parent 92887d3 commit f41a27e
Show file tree
Hide file tree
Showing 12 changed files with 221 additions and 22 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
* text=auto eol=lf

# Windows specific test files where we need CRLF endings
crates/air/tests/fixtures/crlf/*.R text eol=crlf
crates/air_r_formatter/tests/specs/r/crlf/*.R text eol=crlf
crates/air_r_parser/tests/snapshots/ok/crlf/*.R text eol=crlf
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

# Development version

- `air format` is now faster on Windows when nothing changes (#90).

- `air format --check` now works correctly with Windows line endings (#123).

- Magic line breaks are now supported in left assignment (#118).


Expand Down
15 changes: 0 additions & 15 deletions crates/air/src/commands/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,24 +179,11 @@ fn format_file(path: PathBuf, mode: FormatMode) -> Result<FormatFileAction, Form
};
let options = RFormatOptions::default().with_line_ending(line_ending);

let source = line_ending::normalize(source);
let formatted = match format_source(source.as_str(), options) {
Ok(formatted) => formatted,
Err(err) => return Err(FormatCommandError::Format(path.clone(), err)),
};

// TODO: We rarely ever take advantage of this optimization on Windows right
// now. We always normalize on entry but we apply the requested line ending
// on exit (so on Windows we often infer CRLF on entry and normalize to
// LF, but apply CRLF on exit so `source` and `new` always have different
// line endings). We probably need to compare pre-normalized against
// post-formatted output?
//
// Ah, no, the right thing to do is for the cli to not care about normalizing
// line endings. This is mostly useful in the LSP for all the document manipulation
// we are going to do there. In the cli, we want to format a whole bunch of files
// so we really want this optimization, and since the formatter and parser can handle
// windows line endings just fine, we should not normalize here.
match formatted {
FormattedSource::Formatted(new) => {
match mode {
Expand Down Expand Up @@ -231,8 +218,6 @@ pub(crate) enum FormatSourceError {
}

/// Formats a vector of `source` code
///
/// Safety: `source` should already be normalized to Unix line endings
pub(crate) fn format_source(
source: &str,
options: RFormatOptions,
Expand Down
2 changes: 2 additions & 0 deletions crates/air/tests/fixtures/crlf/multiline_string_value.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"multiline
string"
22 changes: 22 additions & 0 deletions crates/air/tests/format.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
use std::path::Path;
use std::path::PathBuf;

use air::args::Args;
use air::run;
use air::status::ExitStatus;
Expand All @@ -21,3 +24,22 @@ fn default_options() -> anyhow::Result<()> {
assert_eq!(err, ExitStatus::Success);
Ok(())
}

#[test]
fn test_check_returns_cleanly_for_multiline_strings_with_crlf_line_endings() -> anyhow::Result<()> {
let fixtures = path_fixtures();
let path = fixtures.join("crlf").join("multiline_string_value.R");
let path = path.to_str().unwrap();

let args = Args::parse_from(["", "format", path, "--check"]);
let err = run(args)?;

assert_eq!(err, ExitStatus::Success);
Ok(())
}

fn path_fixtures() -> PathBuf {
Path::new(env!("CARGO_MANIFEST_DIR"))
.join("tests")
.join("fixtures")
}
3 changes: 0 additions & 3 deletions crates/air_formatter_test/src/spec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@ impl<'a> SpecTestFile<'a> {

let input_code = std::fs::read_to_string(input_file).unwrap();

// Normalize to Unix line endings
let input_code = line_ending::normalize(input_code);

// For the whole file, not a specific range right now
let range_start_index = None;
let range_end_index = None;
Expand Down
1 change: 1 addition & 0 deletions crates/air_r_formatter/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ mod prelude;
mod r;
pub(crate) mod separated;
mod statement_body;
mod string_literal;

#[rustfmt::skip]
mod generated;
Expand Down
3 changes: 2 additions & 1 deletion crates/air_r_formatter/src/r/auxiliary/string_value.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::prelude::*;
use crate::string_literal::FormatStringLiteralToken;
use air_r_syntax::RStringValue;
use air_r_syntax::RStringValueFields;
use biome_formatter::write;
Expand All @@ -8,6 +9,6 @@ pub(crate) struct FormatRStringValue;
impl FormatNodeRule<RStringValue> for FormatRStringValue {
fn fmt_fields(&self, node: &RStringValue, f: &mut RFormatter) -> FormatResult<()> {
let RStringValueFields { value_token } = node.as_fields();
write![f, [value_token.format()]]
write![f, [FormatStringLiteralToken::new(&value_token?)]]
}
}
148 changes: 148 additions & 0 deletions crates/air_r_formatter/src/string_literal.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
use air_r_syntax::RSyntaxKind::R_STRING_LITERAL;
use air_r_syntax::RSyntaxToken;
use biome_formatter::prelude::syntax_token_cow_slice;
use biome_formatter::prelude::Formatter;
use biome_formatter::trivia::format_replaced;
use biome_formatter::Format;
use biome_formatter::FormatResult;
use std::borrow::Cow;

use crate::context::RFormatContext;
use crate::RFormatter;

/// Helper utility for formatting a string literal token
///
/// The main job of this utility is to `normalize()` the string and handle the
/// complicated way we have to call [format_replaced] with that normalized result.
pub(crate) struct FormatStringLiteralToken<'token> {
/// The string literal token to format
token: &'token RSyntaxToken,
}

impl<'token> FormatStringLiteralToken<'token> {
pub(crate) fn new(token: &'token RSyntaxToken) -> Self {
Self { token }
}

fn normalize(&self) -> FormatNormalizedStringLiteralToken {
let token = self.token;

debug_assert!(
matches!(token.kind(), R_STRING_LITERAL),
"Found kind {:?}",
token.kind()
);

let text = token.text_trimmed();
let text = normalize_string(text);

FormatNormalizedStringLiteralToken { token, text }
}
}

impl Format<RFormatContext> for FormatStringLiteralToken<'_> {
fn fmt(&self, f: &mut RFormatter) -> FormatResult<()> {
self.normalize().fmt(f)
}
}

struct FormatNormalizedStringLiteralToken<'token> {
/// The original string literal token before normalization
token: &'token RSyntaxToken,

/// The normalized text
text: Cow<'token, str>,
}

impl Format<RFormatContext> for FormatNormalizedStringLiteralToken<'_> {
fn fmt(&self, f: &mut Formatter<RFormatContext>) -> FormatResult<()> {
format_replaced(
self.token,
&syntax_token_cow_slice(
// Cloning the `Cow<str>` is cheap since 99% of the time it will be the
// `Borrowed` variant. Only with multiline strings on Windows will it
// ever actually clone the underlying string.
self.text.clone(),
self.token,
self.token.text_trimmed_range().start(),
),
)
.fmt(f)
}
}

/// Normalize a string, returning a [`Cow::Borrowed`] if the input was already normalized
///
/// This function:
/// - Normalizes all line endings to `\n`
///
/// We may perform more normalization in the future. We don't use utilities from the
/// `line_ending` crate because we don't own the string.
///
/// This function is particularly useful for multiline strings, which capture the existing
/// line ending inside the string token itself. We must normalize those line endings to
/// `\n` before the formatter -> printer stage, because the printer can't handle other
/// line endings and will panic on them. At the printer -> string stage at the very end,
/// the printer will replace all `\n` with the `LineEnding` requested by the user.
/// https://github.com/biomejs/biome/blob/a658a294087c143b83350cbeb6b44f7a2e9afdd1/crates/biome_formatter/src/printer/mod.rs#L714-L718
fn normalize_string(input: &str) -> Cow<str> {
// The normalized string if `input` is not yet normalized.
// `output` must remain empty if `input` is already normalized.
let mut output = String::new();

// Tracks the last index of `input` that has been written to `output`.
// If `last_loc` is `0` at the end, then the input is already normalized and can be returned as is.
let mut last_loc = 0;

let mut iter = input.char_indices().peekable();

while let Some((loc, char)) = iter.next() {
if char == '\r' {
output.push_str(&input[last_loc..loc]);

if iter.peek().is_some_and(|(_, next)| next == &'\n') {
// CRLF support - skip over the '\r' character, keep the `\n`
iter.next();
} else {
// CR support - Replace the `\r` with a `\n`
output.push('\n');
}

last_loc = loc + '\r'.len_utf8();
}
}

if last_loc == 0 {
Cow::Borrowed(input)
} else {
output.push_str(&input[last_loc..]);
Cow::Owned(output)
}
}

#[cfg(test)]
mod tests {
use crate::string_literal::normalize_string;
use std::borrow::Cow;

#[test]
fn normalize_empty() {
let x = "";
assert_eq!(normalize_string(x), Cow::Borrowed(x));
}

#[test]
fn normalize_newlines() {
let x = "abcd";
assert_eq!(normalize_string(x), Cow::Borrowed(x));

let x = "a\nb\nc\nd\n";
assert_eq!(normalize_string(x), Cow::Borrowed(x));

let x = "a\nb\rc\r\nd\n";
assert_eq!(
normalize_string(x),
Cow::Owned::<str>(String::from("a\nb\nc\nd\n"))
);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"multiline
string"
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
source: crates/air_r_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```R
"multiline
string"
```


## AST

```
RRoot {
bom_token: missing (optional),
expressions: RExpressionList [
RStringValue {
value_token: R_STRING_LITERAL@0..19 "\"multiline\r\nstring\"" [] [],
},
],
eof_token: EOF@19..21 "" [Newline("\r\n")] [],
}
```

## CST

```
0: [email protected]
0: (empty)
1: [email protected]
0: [email protected]
0: [email protected] "\"multiline\r\nstring\"" [] []
2: [email protected] "" [Newline("\r\n")] []
```
3 changes: 0 additions & 3 deletions crates/air_r_parser/tests/spec_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,6 @@ pub fn run(test_case: &str, _snapshot_name: &str, test_directory: &str, outcome_
let content = fs::read_to_string(test_case_path)
.expect("Expected test path to be a readable file in UTF8 encoding");

// Normalize to Unix line endings
let content = line_ending::normalize(content);

let options = RParserOptions::default();
let parsed = parse(&content, options);
let root = RRoot::unwrap_cast(parsed.syntax());
Expand Down

0 comments on commit f41a27e

Please sign in to comment.