Skip to content

Commit

Permalink
Add support in Commons CSV for tracking byte positions during parsing. (
Browse files Browse the repository at this point in the history
#11)

Add support in Commons CSV for tracking byte positions during parsing
  • Loading branch information
DarrenJAN authored Dec 2, 2024
1 parent 74f0970 commit 281bd89
Show file tree
Hide file tree
Showing 10 changed files with 347 additions and 3 deletions.
11 changes: 10 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
See the License for the specific language governing permissions and
limitations under the License.
-->

<!--
Modifications copyright © 2017, 2022, 2024 MarkLogic Corporation.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
Expand All @@ -23,11 +29,12 @@
<version>75</version>
</parent>
<artifactId>commons-csv</artifactId>
<version>1.12.1-SNAPSHOT</version>
<version>1.12.1-marklogic</version>
<name>Apache Commons CSV</name>
<url>https://commons.apache.org/proper/commons-csv/</url>
<inceptionYear>2005</inceptionYear>
<description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
<packaging>jar</packaging>

<dependencies>
<dependency>
Expand Down Expand Up @@ -231,6 +238,8 @@
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>
Expand Down
29 changes: 29 additions & 0 deletions src/main/java/org/apache/commons/csv/CSVFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
* limitations under the License.
*/

/*
* Modifications copyright © 2024 MarkLogic Corporation.
*/
package org.apache.commons.csv;

import static org.apache.commons.io.IOUtils.EOF;
Expand Down Expand Up @@ -2074,6 +2079,30 @@ public CSVParser parse(final Reader reader) throws IOException {
return new CSVParser(reader, this);
}

/**
 * Parses the specified content, optionally tracking byte positions in addition to character positions.
 *
 * <p>
 * This method parses CSV data from a {@link Reader} when parsing does not start at the beginning of the source. It returns a
 * {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s.
 * </p>
 *
 * <p>
 * For additional parsing options, see the various static parse methods available on {@link CSVParser}.
 * </p>
 *
 * @param reader the input reader; must not be null
 * @param characterOffset the lexer offset to apply when the parser does not start parsing at the beginning of the source
 * @param recordNumber the next record number to assign
 * @param encoding the name of the character encoding of the underlying source, used to compute byte positions; may be {@code null},
 *                 in which case no byte positions are computed
 * @return a parser over a stream of {@link CSVRecord}s.
 * @throws IOException If an I/O error occurs
 * @throws CSVException Thrown on invalid input.
 */
public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, final String encoding) throws IOException {
    return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
}

/**
* Prints to the specified output.
*
Expand Down
39 changes: 37 additions & 2 deletions src/main/java/org/apache/commons/csv/CSVParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
* limitations under the License.
*/

/*
* Modifications copyright © 2017, 2024 MarkLogic Corporation.
*/
package org.apache.commons.csv;

import static org.apache.commons.csv.Token.Type.TOKEN;
Expand Down Expand Up @@ -438,10 +443,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
// Delegate with a null encoding: the buffered reader then creates no charset encoder,
// so no byte count is accumulated while reading.
this(reader, format, characterOffset, recordNumber, null);
}

/**
* Constructs a new instance using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign
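* @param encoding
* The name of the character encoding of the source, used to compute the byte length of the characters read;
* may be {@code null}, in which case no byte count is computed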
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either the reader or format is null.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* @throws CSVException Thrown on invalid input.
*/
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
String encoding) throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
this.csvRecordIterator = new CSVRecordIterator();
this.headers = createHeaders();
this.characterOffset = characterOffset;
Expand Down Expand Up @@ -768,6 +802,7 @@ CSVRecord nextRecord() throws IOException {
recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
final long startCharByte = lexer.getBytesRead() + this.characterOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
Expand Down Expand Up @@ -805,7 +840,7 @@ CSVRecord nextRecord() throws IOException {
recordNumber++;
final String comment = Objects.toString(sb, null);
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
recordNumber, startCharPosition);
recordNumber, startCharPosition, startCharByte);
}
return result;
}
Expand Down
29 changes: 29 additions & 0 deletions src/main/java/org/apache/commons/csv/CSVRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
* limitations under the License.
*/

/*
* Modifications copyright © 2017, 2024 MarkLogic Corporation.
*/
package org.apache.commons.csv;

import java.io.Serializable;
Expand Down Expand Up @@ -48,6 +53,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
*/
private final long characterPosition;

/**
* The byte position in the source stream at which this record starts.
*/
private final long characterByte;

/** The accumulated comments (if any) */
private final String comment;

Expand All @@ -67,8 +77,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
this.parser = parser;
this.comment = comment;
this.characterPosition = characterPosition;
this.characterByte = 0L;
}

/**
 * Creates a record that also remembers the byte position at which it starts.
 *
 * @param parser the parser to associate with this record
 * @param values the field values; {@code null} is replaced by an empty array
 * @param comment the accumulated comments (if any)
 * @param recordNumber the number assigned to this record
 * @param characterPosition the character position at which this record starts
 * @param characterByte the byte position at which this record starts
 */
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
        final long characterPosition, final long characterByte) {
    this.parser = parser;
    this.comment = comment;
    this.recordNumber = recordNumber;
    this.characterPosition = characterPosition;
    this.characterByte = characterByte;
    // Never store null: fall back to the shared empty array.
    this.values = values == null ? Constants.EMPTY_STRING_ARRAY : values;
}
/**
* Returns a value by {@link Enum}.
*
Expand Down Expand Up @@ -144,6 +164,15 @@ public long getCharacterPosition() {
return characterPosition;
}

/**
 * Gets the byte position in the source stream at which this record starts.
 *
 * @return the starting byte position of this record.
 */
public long getCharacterByte() {
    return this.characterByte;
}

/**
* Returns the comment for this record, if any.
* Note that comments are attached to the following record.
Expand Down
66 changes: 66 additions & 0 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
* limitations under the License.
*/

/*
* Modifications copyright © 2017, 2022, 2024 MarkLogic Corporation.
*/
package org.apache.commons.csv;

import static org.apache.commons.csv.Constants.CR;
Expand All @@ -24,6 +29,10 @@

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
Expand All @@ -49,13 +58,27 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
private long position;
private long positionMark;

/** The number of bytes read so far */
private long bytesRead;
private long bytesReadMark;

/** Encoder used to calculate the bytes of characters */
private CharsetEncoder encoder;

/**
 * Constructs a new instance using the default buffer size.
 *
 * <p>
 * No character encoding is configured by this constructor, so no charset encoder is created.
 * </p>
 *
 * @param reader the reader to wrap
 */
ExtendedBufferedReader(final Reader reader) {
super(reader);
}

/**
 * Constructs a new instance and, when an encoding is given, prepares an encoder used to measure the bytes of the characters read.
 *
 * @param reader the reader to wrap
 * @param encoding the charset name from which the encoder is created; when {@code null}, no encoder is created
 */
ExtendedBufferedReader(final Reader reader, String encoding) {
    super(reader);
    encoder = encoding == null ? null : Charset.forName(encoding).newEncoder();
}

/**
* Closes the stream.
*
Expand Down Expand Up @@ -108,6 +131,7 @@ public void mark(final int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
bytesReadMark = bytesRead;
super.mark(readAheadLimit);
}

Expand All @@ -118,11 +142,43 @@ public int read() throws IOException {
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
if (encoder != null) {
this.bytesRead += getCharBytes(current);
}
lastChar = current;
position++;
return lastChar;
}

/**
 * Computes how many bytes the value just read occupies in the configured charset.
 *
 * <p>
 * A Java {@code char} is a 16-bit UTF-16 code unit:
 * </p>
 * <ul>
 * <li>U+0000 to U+FFFF (the BMP) is represented by a single {@code char}; in UTF-8 these code points take 1 to 3 bytes.</li>
 * <li>U+10000 to U+10FFFF (supplementary characters) is represented by a pair of {@code char}s, a high surrogate
 * (U+D800 to U+DBFF) followed by a low surrogate (U+DC00 to U+DFFF); in UTF-8 these code points take 4 bytes.</li>
 * </ul>
 * <p>
 * A surrogate pair is therefore measured as a whole when its low surrogate arrives; the high surrogate on its own
 * contributes 0 bytes.
 * </p>
 *
 * @param current the value returned by the last single-character read: a UTF-16 code unit, or a negative value at end of stream
 * @return the encoded length in bytes; 0 for the end-of-stream marker and for a high surrogate
 * @throws CharacterCodingException if the character cannot be encoded, or if a low surrogate does not follow a high surrogate
 */
private long getCharBytes(int current) throws CharacterCodingException {
    if (current < 0) {
        // End of stream (-1): casting it to char would yield U+FFFF, which would either be counted as real
        // bytes or be rejected by charsets that cannot map it, although no character was read.
        return 0;
    }
    final char cChar = (char) current;
    if (!Character.isSurrogate(cChar)) {
        return encoder.encode(CharBuffer.wrap(new char[] {cChar})).limit();
    }
    if (Character.isHighSurrogate(cChar)) {
        // Wait for the low surrogate; the pair is measured as one code point.
        return 0;
    }
    final char lChar = (char) lastChar;
    if (Character.isSurrogatePair(lChar, cChar)) {
        return encoder.encode(CharBuffer.wrap(new char[] {lChar, cChar})).limit();
    }
    // A low surrogate that is not preceded by a high surrogate cannot be encoded.
    throw new CharacterCodingException();
}

@Override
public int read(final char[] buf, final int offset, final int length) throws IOException {
if (length == 0) {
Expand Down Expand Up @@ -187,7 +243,17 @@ public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
bytesRead = bytesReadMark;
super.reset();
}

/**
 * Gets the number of bytes read so far by this reader.
 *
 * @return the number of bytes read by the reader
 */
long getBytesRead() {
    return bytesRead;
}

}
14 changes: 14 additions & 0 deletions src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
* limitations under the License.
*/

/*
* Modifications copyright © 2017 MarkLogic Corporation.
*/
package org.apache.commons.csv;

import static org.apache.commons.io.IOUtils.EOF;
Expand Down Expand Up @@ -103,6 +108,15 @@ long getCharacterPosition() {
return reader.getPosition();
}

/**
 * Gets the number of bytes read so far, as reported by the underlying reader.
 *
 * @return the number of bytes read
 */
long getBytesRead() {
    final long bytesRead = reader.getBytesRead();
    return bytesRead;
}

/**
* Returns the current line number
*
Expand Down
Loading

0 comments on commit 281bd89

Please sign in to comment.