Add support in Commons CSV for tracking byte positions during parsing. (#11)

Add support in Commons CSV for tracking byte positions during parsing
marklogic · Dec 2, 2024 · 281bd89 · 281bd89
1 parent 74f0970
commit 281bd89
Show file tree

Hide file tree

Showing 10 changed files with 347 additions and 3 deletions.
diff --git a/pom.xml b/pom.xml
@@ -15,6 +15,12 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
+
+<!--
+
+ Modifications copyright © 2017, 2022, 2024 MarkLogic Corporation.
+
+-->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
   <modelVersion>4.0.0</modelVersion>
   <parent>
@@ -23,11 +29,12 @@
     <version>75</version>
   </parent>
   <artifactId>commons-csv</artifactId>
-  <version>1.12.1-SNAPSHOT</version>
+  <version>1.12.1-marklogic</version>
   <name>Apache Commons CSV</name>
   <url>https://commons.apache.org/proper/commons-csv/</url>
   <inceptionYear>2005</inceptionYear>
   <description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
+  <packaging>jar</packaging>
 
   <dependencies>
     <dependency>
@@ -231,6 +238,8 @@
               <exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
+              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
+              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>

diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -15,6 +15,11 @@
  * limitations under the License.
  */
 
+ /*
+
+ * Modifications copyright © 2024 MarkLogic Corporation.
+
+ */
 package org.apache.commons.csv;
 
 import static org.apache.commons.io.IOUtils.EOF;
@@ -2074,6 +2079,30 @@ public CSVParser parse(final Reader reader) throws IOException {
         return new CSVParser(reader, this);
     }
 
+    /**
+     * Parses the specified content, optionally tracking the byte position of each record.
+     *
+     * <p>
+     * This method creates a {@link CSVParser} over the given {@link Reader} for input that does not start at the beginning of its
+     * source. The {@code characterOffset} is the position already reached in the source; the parser adds it to the positions it
+     * reports for each {@link CSVRecord}. When {@code encoding} is given, the parser also measures how many bytes each character
+     * read occupies in that charset so that records can report a byte position.
+     * </p>
+     *
+     * <p>
+     * For additional parsing options, see the various static parse methods available on {@link CSVParser}.
+     * </p>
+     *
+     * @param reader the reader supplying the CSV-formatted input
+     * @param characterOffset the offset of the reader within the source, added to reported record positions
+     * @param recordNumber the next record number to assign
+     * @param encoding the name of the charset used to count bytes, or {@code null} to disable byte counting
+     * @return a parser over a stream of {@link CSVRecord}s.
+     * @throws IllegalArgumentException if {@code encoding} does not name a supported charset
+     * @throws IOException If an I/O error occurs
+     * @throws CSVException Thrown on invalid input.
+     */
+    public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, final String encoding)
+            throws IOException {
+        return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
+    }
+
     /**
      * Prints to the specified output.
      *

diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -15,6 +15,11 @@
  * limitations under the License.
  */
 
+ /*
+
+ * Modifications copyright © 2017, 2024 MarkLogic Corporation.
+
+ */
 package org.apache.commons.csv;
 
 import static org.apache.commons.csv.Token.Type.TOKEN;
@@ -438,10 +443,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
     @SuppressWarnings("resource")
     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
         throws IOException {
+        // No encoding is known here, so pass null: the reader then does not count bytes.
+        this(reader, format, characterOffset, recordNumber, null);
+    }
+
+        /**
+     * Constructs a new instance using the given {@link CSVFormat}
+     *
+     * <p>
+     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
+     * unless you close the {@code reader}.
+     * </p>
+     *
+     * @param reader
+     *            a Reader containing CSV-formatted input. Must not be null.
+     * @param format
+     *            the CSVFormat used for CSV parsing. Must not be null.
+     * @param characterOffset
+     *            Lexer offset when the parser does not start parsing at the beginning of the source.
+     * @param recordNumber
+     *            The next record number to assign
+     * @param encoding
+     *            The name of the charset used to count the bytes of the characters read, or {@code null} to disable byte counting
+     * @throws IllegalArgumentException
+     *             If the parameters of the format are inconsistent or if either the reader or format is null.
+     * @throws IOException
+     *             If there is a problem reading the header or skipping the first record
+     * @throws CSVException Thrown on invalid input.
+     */
+    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
+        String encoding) throws IOException {
         Objects.requireNonNull(reader, "reader");
         Objects.requireNonNull(format, "format");
         this.format = format.copy();
-        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
+        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
         this.csvRecordIterator = new CSVRecordIterator();
         this.headers = createHeaders();
         this.characterOffset = characterOffset;
@@ -768,6 +802,7 @@ CSVRecord nextRecord() throws IOException {
         recordList.clear();
         StringBuilder sb = null;
         final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
+        final long startCharByte = lexer.getBytesRead() + this.characterOffset;
         do {
             reusableToken.reset();
             lexer.nextToken(reusableToken);
@@ -805,7 +840,7 @@ CSVRecord nextRecord() throws IOException {
             recordNumber++;
             final String comment = Objects.toString(sb, null);
             result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
-                recordNumber, startCharPosition);
+                recordNumber, startCharPosition, startCharByte);
         }
         return result;
     }

diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -15,6 +15,11 @@
  * limitations under the License.
  */
 
+/*
+
+ * Modifications copyright © 2017, 2024 MarkLogic Corporation.
+
+ */
 package org.apache.commons.csv;
 
 import java.io.Serializable;
@@ -48,6 +53,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
      */
     private final long characterPosition;
 
+    /**
+     * The byte position at which this record starts in the source stream, as passed to the constructor.
+     */
+    private final long characterByte;
+
     /** The accumulated comments (if any) */
     private final String comment;
 
@@ -67,8 +77,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
         this.parser = parser;
         this.comment = comment;
         this.characterPosition = characterPosition;
+        this.characterByte = 0L;
     }
 
+    /**
+     * Constructs a record that also remembers the byte position at which it starts.
+     *
+     * @param parser the parser that produced this record
+     * @param values the field values; {@code null} is replaced by an empty array
+     * @param comment the comment accumulated for this record, if any
+     * @param recordNumber the number assigned to this record
+     * @param characterPosition the character position at which this record starts in the source stream
+     * @param characterByte the byte position at which this record starts in the source stream
+     */
+    CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
+            final long characterPosition, final long characterByte) {
+        this.parser = parser;
+        this.recordNumber = recordNumber;
+        this.characterPosition = characterPosition;
+        this.characterByte = characterByte;
+        this.comment = comment;
+        this.values = values == null ? Constants.EMPTY_STRING_ARRAY : values;
+    }
     /**
      * Returns a value by {@link Enum}.
      *
@@ -144,6 +164,15 @@ public long getCharacterPosition() {
         return characterPosition;
     }
 
+    /**
+     * Gets the byte position at which this record starts in the source stream.
+     *
+     * <p>
+     * Records created without a byte position report {@code 0}.
+     * </p>
+     *
+     * @return the starting byte position recorded for this record when it was constructed
+     */
+    public long getCharacterByte() {
+        return this.characterByte;
+    }
+
     /**
      * Returns the comment for this record, if any.
      * Note that comments are attached to the following record.

diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -15,6 +15,11 @@
  * limitations under the License.
  */
 
+ /*
+
+ * Modifications copyright © 2017, 2022, 2024 MarkLogic Corporation.
+
+ */
 package org.apache.commons.csv;
 
 import static org.apache.commons.csv.Constants.CR;
@@ -24,6 +29,10 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -49,13 +58,27 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
     private long position;
     private long positionMark;
 
+    /** The number of bytes read so far; {@code bytesReadMark} holds the value saved by {@code mark} and restored by {@code reset}. */
+    private long bytesRead;
+    private long bytesReadMark;
+
+    /** Encoder used to calculate the encoded byte length of the characters read; stays {@code null} when no encoding is given. */
+    private CharsetEncoder encoder;
+
     /**
      * Constructs a new instance using the default buffer size.
      */
     ExtendedBufferedReader(final Reader reader) {
         super(reader);
     }
 
+    /**
+     * Constructs a new instance using the default buffer size and, optionally, a charset for counting bytes.
+     *
+     * @param reader the reader to wrap
+     * @param encoding the name of the charset whose encoder measures the characters read, or {@code null} to leave byte counting off
+     */
+    ExtendedBufferedReader(final Reader reader, String encoding) {
+        super(reader);
+        if (encoding == null) {
+            // No charset given: the encoder stays unset and no bytes are counted.
+            return;
+        }
+        encoder = Charset.forName(encoding).newEncoder();
+    }
+
     /**
      * Closes the stream.
      *
@@ -108,6 +131,7 @@ public void mark(final int readAheadLimit) throws IOException {
         lineNumberMark = lineNumber;
         lastCharMark = lastChar;
         positionMark = position;
+        bytesReadMark = bytesRead;
         super.mark(readAheadLimit);
     }
 
@@ -118,11 +142,43 @@ public int read() throws IOException {
             current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
             lineNumber++;
         }
+        if (encoder != null) {
+            this.bytesRead += getCharBytes(current);
+        }
         lastChar = current;
         position++;
         return lastChar;
     }
 
+    /**
+     * Computes how many bytes the value just read contributes to the byte count.
+     *
+     * <p>
+     * Java {@code char}s are 16-bit UTF-16 code units:
+     * </p>
+     * <ul>
+     * <li>U+0000 to U+FFFF (the BMP) are a single {@code char}, which is encoded on its own.</li>
+     * <li>U+10000 to U+10FFFF (supplementary characters) are a pair of {@code char}s: a high surrogate (U+D800 to U+DBFF)
+     * followed by a low surrogate (U+DC00 to U+DFFF). The high surrogate contributes 0 bytes; the pair is encoded as a whole
+     * once the low surrogate arrives.</li>
+     * </ul>
+     *
+     * <p>
+     * The end-of-stream marker is not a character and contributes 0 bytes. Without that check it would be narrowed to U+FFFF
+     * and either be counted as real data or be rejected by an encoder that cannot map that code point.
+     * </p>
+     *
+     * @param current the value just read: a UTF-16 code unit, or {@code EOF} at end of stream
+     * @return the number of bytes the encoder produces for the character completed by {@code current}
+     * @throws CharacterCodingException if {@code current} is a low surrogate that does not follow a high surrogate, or if the
+     *         encoder cannot encode the character
+     */
+    private long getCharBytes(int current) throws CharacterCodingException {
+        if (current == EOF) {
+            // End of stream: nothing was read, so nothing is counted.
+            return 0;
+        }
+        final char cChar = (char) current;
+        if (!Character.isSurrogate(cChar)) {
+            return encoder.encode(CharBuffer.wrap(new char[] {cChar})).limit();
+        }
+        if (Character.isHighSurrogate(cChar)) {
+            // Counted together with the low surrogate expected next.
+            return 0;
+        }
+        // A low surrogate is only valid directly after the high surrogate held in lastChar.
+        final char lChar = (char) lastChar;
+        if (Character.isSurrogatePair(lChar, cChar)) {
+            return encoder.encode(CharBuffer.wrap(new char[] {lChar, cChar})).limit();
+        }
+        throw new CharacterCodingException();
+    }
+
     @Override
     public int read(final char[] buf, final int offset, final int length) throws IOException {
         if (length == 0) {
@@ -187,7 +243,17 @@ public void reset() throws IOException {
         lineNumber = lineNumberMark;
         lastChar = lastCharMark;
         position = positionMark;
+        bytesRead = bytesReadMark;
         super.reset();
     }
 
+    /**
+     * Gets the running count of bytes read by this reader.
+     *
+     * @return the number of bytes read by the reader
+     */
+    long getBytesRead() {
+        return bytesRead;
+    }
+
 }
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -15,6 +15,11 @@
  * limitations under the License.
  */
 
+ /*
+
+ * Modifications copyright © 2017 MarkLogic Corporation.
+
+ */
 package org.apache.commons.csv;
 
 import static org.apache.commons.io.IOUtils.EOF;
@@ -103,6 +108,15 @@ long getCharacterPosition() {
         return reader.getPosition();
     }
 
+    /**
+     * Gets the number of bytes read, as reported by the underlying reader.
+     *
+     * @return the number of bytes read
+     */
+    long getBytesRead() {
+        return this.reader.getBytesRead();
+    }
+
     /**
      * Returns the current line number
      *