From 12e0fe264b3e8a0e32f9612c0fa0cb4071cdd7bc Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Mon, 18 Nov 2024 08:20:21 -0800 Subject: [PATCH 1/4] Add support in Commons CSV for tracking byte positions during parsing --- .../org/apache/commons/csv/CSVFormat.java | 24 ---- .../org/apache/commons/csv/CSVParser.java | 30 +++-- .../commons/csv/ExtendedBufferedReader.java | 46 +++++--- .../org/apache/commons/csv/CSVParserTest.java | 109 ++++++++---------- .../apache/commons/csv/JiraCsv196Test.java | 36 +++--- 5 files changed, 117 insertions(+), 128 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index cabcb5135e..8205f4c47e 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -2097,30 +2097,6 @@ public CSVParser parse(final Reader reader) throws IOException { return CSVParser.builder().setReader(reader).setFormat(this).get(); } - /** - * Parses the specified content. - * - *

- * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number, - * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s. - *

- * - *

- * For additional parsing options, see the various static parse methods available on {@link CSVParser}. - *

- * - * @param reader the input stream - * @param characterOffset the character offset to start parsing from - * @param recordNumber the initial record number to start counting from - * @param encoding the character encoding of the input stream - * @return a parser over a stream of {@link CSVRecord}s. - * @throws IOException If an I/O error occurs - * @throws CSVException Thrown on invalid input. - */ - public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { - return new CSVParser(reader, this, characterOffset, recordNumber, encoding); - } - /** * Prints to the specified output. * diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index c48e1da096..b7e0ab2b27 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -153,6 +153,7 @@ public static class Builder extends AbstractStreamBuilder { private CSVFormat format; private long characterOffset; private long recordNumber = 1; + private Charset charset; /** * Constructs a new instance. @@ -164,7 +165,7 @@ protected Builder() { @SuppressWarnings("resource") @Override public CSVParser get() throws IOException { - return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber); + return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, charset); } /** @@ -200,6 +201,16 @@ public Builder setRecordNumber(final long recordNumber) { return asThis(); } + /** + * Sets the Charset to use for the reader. + * + * @param charset the Charset to use for the reader. + * @return this instance. + */ + public Builder setCharset(final Charset charset) { + this.charset = charset; + return asThis(); + } } final class CSVRecordIterator implements Iterator { @@ -510,7 +521,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact this(reader, format, characterOffset, recordNumber, null); } - /** + /** * Constructs a new instance using the given {@link CSVFormat} * *

@@ -525,21 +536,22 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @param characterOffset * Lexer offset when the parser does not start parsing at the beginning of the source. * @param recordNumber - * The next record number to assign - * @param encoding - * The encoding to use for the reader + * The next record number to assign. + * @param charset + * The Charset to decode the given file. * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either the reader or format is null. * @throws IOException - * If there is a problem reading the header or skipping the first record + * If there is a problem reading the header or skipping the first record. * @throws CSVException Thrown on invalid input. + * @since 1.13.0. */ - public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, - String encoding) throws IOException { + private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset) + throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 2a82d48a5a..2593ac496c 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -53,12 +53,12 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long position; private long positionMark; - /** The number of bytes read so far */ + /** The number of bytes read so far. */ private long bytesRead; private long bytesReadMark; - /** Encoder used to calculate the bytes of characters */ - CharsetEncoder encoder; + /** Encoder used to calculate the bytes of characters. */ + private CharsetEncoder encoder; /** * Constructs a new instance using the default buffer size. @@ -67,10 +67,10 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } - ExtendedBufferedReader(final Reader reader, String encoding) { + ExtendedBufferedReader(final Reader reader, Charset charset) { super(reader); - if (encoding != null) { - encoder = Charset.forName(encoding).newEncoder(); + if (charset != null) { + encoder = charset.newEncoder(); } } @@ -146,20 +146,30 @@ public int read() throws IOException { } /** - * In Java, a char data type are based on the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. - * U+0000 to U+FFFF: - * - BMP, represented using 1 16-bit char - * - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars - * U+10000 to U+10FFFF: - * - Supplementary characters, represented as a pair of characters, - * the first char from the high-surrogates range (\uD800-\uDBFF), - * and the second char from the low-surrogates range (uDC00-\uDFFF). - * - Consists of UTF-8 some 3-byte chars and 4-byte chars + * In Java, the {@code char} data type is based on the original Unicode + * specification, which defined characters as fixed-width 16-bit entities. + *

+ * The Unicode characters are divided into two main ranges: + *

    + *
  • U+0000 to U+FFFF (Basic Multilingual Plane, BMP): + *
      + *
    • Represented using a single 16-bit {@code char}.
    • + *
    • Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.
    • + *
    + *
  • + *
  • U+10000 to U+10FFFF (Supplementary Characters): + *
      + *
    • Represented as a pair of {@code char}s:
    • + *
    • The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).
    • + *
    • The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).
    • + *
    • Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.
    • + *
    + *
  • + *
*/ private long getCharBytes(int current) throws CharacterCodingException { - char cChar = (char) current; - char lChar = (char) lastChar; + final char cChar = (char) current; + final char lChar = (char) lastChar; if (!Character.isSurrogate(cChar)) { return encoder.encode( CharBuffer.wrap(new char[] {cChar})).limit(); diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index fd1ecdb021..73841a30d5 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -707,76 +707,69 @@ public void testGetRecordThreeBytesRead() throws Exception { "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; - // String code = "'1',4"; - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); + .setDelimiter(',') + .setQuote('\'') + .get(); + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get() ) { + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); - CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); - assertEquals(0, parser.getRecordNumber()); - assertNotNull(record = parser.nextRecord()); - assertEquals(1, record.getRecordNumber()); - assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(2, record.getRecordNumber()); - assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - assertNotNull(record = parser.nextRecord()); - assertEquals(3, record.getRecordNumber()); - assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 95); + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - assertNotNull(record = parser.nextRecord()); - assertEquals(4, record.getRecordNumber()); - assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 154); + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 95); - parser.close(); + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 154); + }; } @Test public void testGetRecordFourBytesRead() throws Exception { String code = "id,a,b,c\n" + - "1,😊,🤔,😂\n" + - "2,😊,🤔,😂\n" + - "3,😊,🤔,😂\n"; - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + "1,😊,🤔,😂\n" + + "2,😊,🤔,😂\n" + + "3,😊,🤔,😂\n"; final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - - // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); - - CSVRecord record; - assertEquals(0, parser.getRecordNumber()); - assertNotNull(record = parser.nextRecord()); - assertEquals(1, record.getRecordNumber()); - assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(2, record.getRecordNumber()); - assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - assertNotNull(record = parser.nextRecord()); - assertEquals(3, record.getRecordNumber()); - assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 26); - assertNotNull(record = parser.nextRecord()); - assertEquals(4, record.getRecordNumber()); - assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 43); - parser.close(); + .setDelimiter(',') + .setQuote('\'') + .get(); + try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get()) { + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); + + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 26); + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 43); + } } @Test diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java index 7dbc23cafa..746bdea1b0 100644 --- a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -21,7 +21,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; - +import java.nio.charset.StandardCharsets; import org.junit.jupiter.api.Test; @@ -29,16 +29,15 @@ public class JiraCsv196Test { @Test public void parseThreeBytes() throws IOException { - - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - // CSVParser parser = new CSVParser(getTestInput( - // "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(getTestInput( - "org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8"); + .setDelimiter(',') + .setQuote('\'') + .get(); + CSVParser parser = new CSVParser.Builder() + .setFormat(format) + .setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv")) + .setCharset(StandardCharsets.UTF_8) + .get(); long[] charByteKey = {0, 89, 242, 395}; int idx = 0; for (CSVRecord record : parser) { @@ -50,15 +49,15 @@ public void parseThreeBytes() throws IOException { @Test public void parseFourBytes() throws IOException { - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - - CSVParser parser = format.parse(getTestInput( - "org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8"); - + .setDelimiter(',') + .setQuote('\'') + .get(); + CSVParser parser = new CSVParser.Builder() + .setFormat(format) + .setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv")) + .setCharset(StandardCharsets.UTF_8) + .get(); long[] charByteKey = {0, 84, 701, 1318, 1935}; int idx = 0; for (CSVRecord record : parser) { @@ -67,7 +66,6 @@ public void parseFourBytes() throws IOException { parser.close(); } - private Reader getTestInput(String path) { return new InputStreamReader( ClassLoader.getSystemClassLoader().getResourceAsStream(path)); From 67d841f475683cff0df369958b0e0f50f53694ec Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Mon, 18 Nov 2024 08:40:21 -0800 Subject: [PATCH 2/4] Delete trailing spaces and delete jar in pom.xml --- pom.xml | 1 - src/main/java/org/apache/commons/csv/CSVParser.java | 6 +++--- src/test/java/org/apache/commons/csv/CSVParserTest.java | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index bfdf9e74a7..a03787382e 100644 --- a/pom.xml +++ b/pom.xml @@ -28,7 +28,6 @@ https://commons.apache.org/proper/commons-csv/ 2005 The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types. - jar diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index b7e0ab2b27..2a22c94b3e 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -201,9 +201,9 @@ public Builder setRecordNumber(final long recordNumber) { return asThis(); } - /** + /** * Sets the Charset to use for the reader. - * + * * @param charset the Charset to use for the reader. * @return this instance. */ @@ -546,7 +546,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @throws CSVException Thrown on invalid input. * @since 1.13.0. */ - private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset) + private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 73841a30d5..8adb9a4f28 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -750,7 +750,7 @@ public void testGetRecordFourBytesRead() throws Exception { .get(); try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get()) { CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); - + assertEquals(0, parser.getRecordNumber()); assertNotNull(record = parser.nextRecord()); assertEquals(1, record.getRecordNumber()); From c0d83858bbef8b710a4018cfbd3fa1d777923f93 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Tue, 19 Nov 2024 08:08:00 -0800 Subject: [PATCH 3/4] Fix the comments and the indentation --- .../org/apache/commons/csv/CSVParser.java | 6 ++-- .../commons/csv/ExtendedBufferedReader.java | 2 +- .../org/apache/commons/csv/CSVParserTest.java | 18 ++++++------ .../apache/commons/csv/JiraCsv196Test.java | 28 +++++++++---------- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 2a22c94b3e..f698532cc2 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -202,9 +202,9 @@ public Builder setRecordNumber(final long recordNumber) { } /** - * Sets the Charset to use for the reader. + * Sets the character encoding to be used for the reader. * - * @param charset the Charset to use for the reader. + * @param charset the character encoding. * @return this instance. */ public Builder setCharset(final Charset charset) { @@ -538,7 +538,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @param recordNumber * The next record number to assign. * @param charset - * The Charset to decode the given file. + * The character encoding to be used for the reader. * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either the reader or format is null. * @throws IOException diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 2593ac496c..158f90a755 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -57,7 +57,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long bytesRead; private long bytesReadMark; - /** Encoder used to calculate the bytes of characters. */ + /** Encoder for calculating the number of bytes for each character read. */ private CharsetEncoder encoder; /** diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 8adb9a4f28..2b68155624 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -708,9 +708,9 @@ public void testGetRecordThreeBytesRead() throws Exception { "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .get(); + .setDelimiter(',') + .setQuote('\'') + .get(); try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get() ) { CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); @@ -741,13 +741,13 @@ public void testGetRecordThreeBytesRead() throws Exception { @Test public void testGetRecordFourBytesRead() throws Exception { String code = "id,a,b,c\n" + - "1,😊,🤔,😂\n" + - "2,😊,🤔,😂\n" + - "3,😊,🤔,😂\n"; + "1,😊,🤔,😂\n" + + "2,😊,🤔,😂\n" + + "3,😊,🤔,😂\n"; final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .get(); + .setDelimiter(',') + .setQuote('\'') + .get(); try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get()) { CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java index 746bdea1b0..853007f9e5 100644 --- a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -30,14 +30,14 @@ public class JiraCsv196Test { @Test public void parseThreeBytes() throws IOException { final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .get(); + .setDelimiter(',') + .setQuote('\'') + .get(); CSVParser parser = new CSVParser.Builder() - .setFormat(format) - .setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv")) - .setCharset(StandardCharsets.UTF_8) - .get(); + .setFormat(format) + .setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv")) + .setCharset(StandardCharsets.UTF_8) + .get(); long[] charByteKey = {0, 89, 242, 395}; int idx = 0; for (CSVRecord record : parser) { @@ -50,14 +50,14 @@ public void parseThreeBytes() throws IOException { @Test public void parseFourBytes() throws IOException { final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .get(); + .setDelimiter(',') + .setQuote('\'') + .get(); CSVParser parser = new CSVParser.Builder() - .setFormat(format) - .setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv")) - .setCharset(StandardCharsets.UTF_8) - .get(); + .setFormat(format) + .setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv")) + .setCharset(StandardCharsets.UTF_8) + .get(); long[] charByteKey = {0, 84, 701, 1318, 1935}; int idx = 0; for (CSVRecord record : parser) { From 65fa71a3ebbfb4bc08e3c89151f7b9419388e9a1 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Tue, 19 Nov 2024 12:19:52 -0800 Subject: [PATCH 4/4] Fix the indentation --- src/main/java/org/apache/commons/csv/CSVParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index f698532cc2..024dd562d4 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -521,7 +521,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact this(reader, format, characterOffset, recordNumber, null); } - /** + /** * Constructs a new instance using the given {@link CSVFormat} * *