Skip to content

Commit

Permalink
Add support in Commons CSV for tracking byte positions during parsing (
Browse files Browse the repository at this point in the history
…#12)

Add support in Commons CSV for tracking byte positions during parsing
  • Loading branch information
DarrenJAN authored Nov 19, 2024
1 parent b244cb1 commit 3599f5b
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 122 deletions.
1 change: 0 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
<url>https://commons.apache.org/proper/commons-csv/</url>
<inceptionYear>2005</inceptionYear>
<description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
<packaging>jar</packaging>

<dependencies>
<dependency>
Expand Down
24 changes: 0 additions & 24 deletions src/main/java/org/apache/commons/csv/CSVFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -2097,30 +2097,6 @@ public CSVParser parse(final Reader reader) throws IOException {
return CSVParser.builder().setReader(reader).setFormat(this).get();
}

/**
* Parses the specified content.
*
* <p>
* This method provides a way to parse CSV data from a {@link Reader}, starting at a specified character offset and record number,
* using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s.
* </p>
*
* <p>
* This method only delegates: it passes this format and all arguments, unchanged, to the {@link CSVParser} constructor.
* </p>
*
* <p>
* For additional parsing options, see the various static parse methods available on {@link CSVParser}.
* </p>
*
* @param reader the character source to parse
* @param characterOffset the character offset to start parsing from
* @param recordNumber the initial record number to start counting from
* @param encoding the name of the character encoding of the data behind {@code reader}; handed to the parser as-is
* @return a parser over a stream of {@link CSVRecord}s.
* @throws IOException If an I/O error occurs
* @throws CSVException Thrown on invalid input.
*/
public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
// Pure delegation: the parser is constructed with this format and the caller's arguments.
return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
}

/**
* Prints to the specified output.
*
Expand Down
30 changes: 21 additions & 9 deletions src/main/java/org/apache/commons/csv/CSVParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
private CSVFormat format;
private long characterOffset;
private long recordNumber = 1;
private Charset charset;

/**
* Constructs a new instance.
Expand All @@ -164,7 +165,7 @@ protected Builder() {
@SuppressWarnings("resource")
@Override
public CSVParser get() throws IOException {
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber);
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, charset);
}

/**
Expand Down Expand Up @@ -200,6 +201,16 @@ public Builder setRecordNumber(final long recordNumber) {
return asThis();
}

/**
* Sets the character encoding to be used for the reader.
*
* <p>
* The value is only stored on this builder; it is handed to the {@link CSVParser} constructed by {@link #get()}.
* </p>
*
* @param charset the character encoding; may be {@code null}, which is also the value used when this method is never called.
* @return this instance.
*/
public Builder setCharset(final Charset charset) {
this.charset = charset;
return asThis();
}
}

final class CSVRecordIterator implements Iterator<CSVRecord> {
Expand Down Expand Up @@ -510,7 +521,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
this(reader, format, characterOffset, recordNumber, null);
}

/**
/**
* Constructs a new instance using the given {@link CSVFormat}
*
* <p>
Expand All @@ -525,21 +536,22 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign
* @param encoding
* The encoding to use for the reader
* The next record number to assign.
* @param charset
* The character encoding to be used for the reader.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either the reader or format is null.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* If there is a problem reading the header or skipping the first record.
* @throws CSVException Thrown on invalid input.
* @since 1.13.0.
*/
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
String encoding) throws IOException {
private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset)
throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset));
this.csvRecordIterator = new CSVRecordIterator();
this.headers = createHeaders();
this.characterOffset = characterOffset;
Expand Down
46 changes: 28 additions & 18 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
private long position;
private long positionMark;

/** The number of bytes read so far */
/** The number of bytes read so far. */
private long bytesRead;
private long bytesReadMark;

/** Encoder used to calculate the bytes of characters */
CharsetEncoder encoder;
/** Encoder for calculating the number of bytes for each character read. */
private CharsetEncoder encoder;

/**
* Constructs a new instance using the default buffer size.
Expand All @@ -67,10 +67,10 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
super(reader);
}

ExtendedBufferedReader(final Reader reader, String encoding) {
ExtendedBufferedReader(final Reader reader, Charset charset) {
super(reader);
if (encoding != null) {
encoder = Charset.forName(encoding).newEncoder();
if (charset != null) {
encoder = charset.newEncoder();
}
}

Expand Down Expand Up @@ -146,20 +146,30 @@ public int read() throws IOException {
}

/**
* In Java, a char data type are based on the original Unicode
* specification, which defined characters as fixed-width 16-bit entities.
* U+0000 to U+FFFF:
* - BMP, represented using 1 16-bit char
* - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars
* U+10000 to U+10FFFF:
* - Supplementary characters, represented as a pair of characters,
* the first char from the high-surrogates range (\uD800-\uDBFF),
* and the second char from the low-surrogates range (uDC00-\uDFFF).
* - Consists of UTF-8 some 3-byte chars and 4-byte chars
* In Java, the {@code char} data type is based on the original Unicode
* specification, which defined characters as fixed-width 16-bit entities.
* <p>
* The Unicode characters are divided into two main ranges:
* <ul>
* <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
* <ul>
* <li>Represented using a single 16-bit {@code char}.</li>
* <li>Covers every character whose UTF-8 encoding is 1, 2, or 3 bytes long.</li>
* </ul>
* </li>
* <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
* <ul>
* <li>Represented as a pair of {@code char}s:</li>
* <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
* <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
* <li>Always encoded as exactly 4 bytes in UTF-8.</li>
* </ul>
* </li>
* </ul>
*/
private long getCharBytes(int current) throws CharacterCodingException {
char cChar = (char) current;
char lChar = (char) lastChar;
final char cChar = (char) current;
final char lChar = (char) lastChar;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(
CharBuffer.wrap(new char[] {cChar})).limit();
Expand Down
99 changes: 46 additions & 53 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -707,38 +707,34 @@ public void testGetRecordThreeBytesRead() throws Exception {
"11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
"22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
"33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
// String code = "'1',4";
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
final CSVFormat format = CSVFormat.Builder.create()
.setDelimiter(',')
.setQuote('\'')
.build();
// CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8");

CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
assertEquals(0, parser.getRecordNumber());
assertNotNull(record = parser.nextRecord());
assertEquals(1, record.getRecordNumber());
assertEquals(code.indexOf('i'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
.setDelimiter(',')
.setQuote('\'')
.get();
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get() ) {
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);

assertNotNull(record = parser.nextRecord());
assertEquals(2, record.getRecordNumber());
assertEquals(code.indexOf('1'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
assertEquals(0, parser.getRecordNumber());
assertNotNull(record = parser.nextRecord());
assertEquals(1, record.getRecordNumber());
assertEquals(code.indexOf('i'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(3, record.getRecordNumber());
assertEquals(code.indexOf('2'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 95);
assertNotNull(record = parser.nextRecord());
assertEquals(2, record.getRecordNumber());
assertEquals(code.indexOf('1'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(4, record.getRecordNumber());
assertEquals(code.indexOf('3'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 154);
assertNotNull(record = parser.nextRecord());
assertEquals(3, record.getRecordNumber());
assertEquals(code.indexOf('2'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 95);

parser.close();
assertNotNull(record = parser.nextRecord());
assertEquals(4, record.getRecordNumber());
assertEquals(code.indexOf('3'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 154);
};

}

Expand All @@ -748,35 +744,32 @@ public void testGetRecordFourBytesRead() throws Exception {
"1,😊,🤔,😂\n" +
"2,😊,🤔,😂\n" +
"3,😊,🤔,😂\n";
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
final CSVFormat format = CSVFormat.Builder.create()
.setDelimiter(',')
.setQuote('\'')
.build();

// CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8");

CSVRecord record;
assertEquals(0, parser.getRecordNumber());
assertNotNull(record = parser.nextRecord());
assertEquals(1, record.getRecordNumber());
assertEquals(code.indexOf('i'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(2, record.getRecordNumber());
assertEquals(code.indexOf('1'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
assertNotNull(record = parser.nextRecord());
assertEquals(3, record.getRecordNumber());
assertEquals(code.indexOf('2'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 26);
assertNotNull(record = parser.nextRecord());
assertEquals(4, record.getRecordNumber());
assertEquals(code.indexOf('3'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 43);
parser.close();
.get();
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).get()) {
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);

assertEquals(0, parser.getRecordNumber());
assertNotNull(record = parser.nextRecord());
assertEquals(1, record.getRecordNumber());
assertEquals(code.indexOf('i'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(2, record.getRecordNumber());
assertEquals(code.indexOf('1'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
assertNotNull(record = parser.nextRecord());
assertEquals(3, record.getRecordNumber());
assertEquals(code.indexOf('2'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 26);
assertNotNull(record = parser.nextRecord());
assertEquals(4, record.getRecordNumber());
assertEquals(code.indexOf('3'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 43);
}
}

@Test
Expand Down
32 changes: 15 additions & 17 deletions src/test/java/org/apache/commons/csv/JiraCsv196Test.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,23 @@
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;

import java.nio.charset.StandardCharsets;

import org.junit.jupiter.api.Test;


public class JiraCsv196Test {
@Test
public void parseThreeBytes() throws IOException {

// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
final CSVFormat format = CSVFormat.Builder.create()
.setDelimiter(',')
.setQuote('\'')
.build();
// CSVParser parser = new CSVParser(getTestInput(
// "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8");
CSVParser parser = format.parse(getTestInput(
"org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8");
.setDelimiter(',')
.setQuote('\'')
.get();
CSVParser parser = new CSVParser.Builder()
.setFormat(format)
.setReader(getTestInput("org/apache/commons/csv/CSV-196/japanese.csv"))
.setCharset(StandardCharsets.UTF_8)
.get();
long[] charByteKey = {0, 89, 242, 395};
int idx = 0;
for (CSVRecord record : parser) {
Expand All @@ -50,15 +49,15 @@ public void parseThreeBytes() throws IOException {

@Test
public void parseFourBytes() throws IOException {
// final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
final CSVFormat format = CSVFormat.Builder.create()
.setDelimiter(',')
.setQuote('\'')
.build();

CSVParser parser = format.parse(getTestInput(
"org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8");

.get();
CSVParser parser = new CSVParser.Builder()
.setFormat(format)
.setReader(getTestInput("org/apache/commons/csv/CSV-196/emoji.csv"))
.setCharset(StandardCharsets.UTF_8)
.get();
long[] charByteKey = {0, 84, 701, 1318, 1935};
int idx = 0;
for (CSVRecord record : parser) {
Expand All @@ -67,7 +66,6 @@ public void parseFourBytes() throws IOException {
parser.close();
}


private Reader getTestInput(String path) {
return new InputStreamReader(
ClassLoader.getSystemClassLoader().getResourceAsStream(path));
Expand Down

0 comments on commit 3599f5b

Please sign in to comment.