Revert "Add support in Commons CSV for tracking byte positions during parsing…" (#8)

This reverts commit c413aac.
marklogic · Nov 5, 2024 · 457eeb3 · 457eeb3
1 parent c413aac
commit 457eeb3
Show file tree

Hide file tree

Showing 10 changed files with 2 additions and 315 deletions.
diff --git a/pom.xml b/pom.xml
@@ -28,7 +28,6 @@
   <url>https://commons.apache.org/proper/commons-csv/</url>
   <inceptionYear>2005</inceptionYear>
   <description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
-  <packaging>jar</packaging>
 
   <dependencies>
     <dependency>
@@ -232,8 +231,6 @@
               <exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
-              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
-              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>

diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -2097,30 +2097,6 @@ public CSVParser parse(final Reader reader) throws IOException {
         return new CSVParser(reader, this);
     }
 
-    /**
-     * Parses the specified content.
-     *
-     * <p>
-     * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number,
-     * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s.
-     * </p>
-     *
-     * <p>
-     * For additional parsing options, see the various static parse methods available on {@link CSVParser}.
-     * </p>
-     *
-     * @param reader the input stream
-     * @param characterOffset the character offset to start parsing from
-     * @param recordNumber the initial record number to start counting from
-     * @param encoding the character encoding of the input stream
-     * @return a parser over a stream of {@link CSVRecord}s.
-     * @throws IOException If an I/O error occurs
-     * @throws CSVException Thrown on invalid input.
-     */
-    public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
-        return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
-    }
-
     /**
      * Prints to the specified output.
      *

diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -511,39 +511,10 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
     @SuppressWarnings("resource")
     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
         throws IOException {
-            this(reader, format, characterOffset, recordNumber, null);
-        }
-
-        /**
-     * Constructs a new instance using the given {@link CSVFormat}
-     *
-     * <p>
-     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
-     * unless you close the {@code reader}.
-     * </p>
-     *
-     * @param reader
-     *            a Reader containing CSV-formatted input. Must not be null.
-     * @param format
-     *            the CSVFormat used for CSV parsing. Must not be null.
-     * @param characterOffset
-     *            Lexer offset when the parser does not start parsing at the beginning of the source.
-     * @param recordNumber
-     *            The next record number to assign
-     * @param encoding
-     *            The encoding to use for the reader
-     * @throws IllegalArgumentException
-     *             If the parameters of the format are inconsistent or if either the reader or format is null.
-     * @throws IOException
-     *             If there is a problem reading the header or skipping the first record
-     * @throws CSVException Thrown on invalid input.
-     */
-    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
-        String encoding) throws IOException {
         Objects.requireNonNull(reader, "reader");
         Objects.requireNonNull(format, "format");
         this.format = format.copy();
-        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
+        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
         this.csvRecordIterator = new CSVRecordIterator();
         this.headers = createHeaders();
         this.characterOffset = characterOffset;
@@ -870,7 +841,6 @@ CSVRecord nextRecord() throws IOException {
         recordList.clear();
         StringBuilder sb = null;
         final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
-        final long startCharByte = lexer.getBytesRead() + this.characterOffset;
         do {
             reusableToken.reset();
             lexer.nextToken(reusableToken);
@@ -908,7 +878,7 @@ CSVRecord nextRecord() throws IOException {
             recordNumber++;
             final String comment = Objects.toString(sb, null);
             result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
-                recordNumber, startCharPosition, startCharByte);
+                recordNumber, startCharPosition);
         }
         return result;
     }

diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -48,11 +48,6 @@ public final class CSVRecord implements Serializable, Iterable<String> {
      */
     private final long characterPosition;
 
-    /**
-     * The start byte of this record as a character byte in the source stream.
-     */
-    private final long characterByte;
-
     /** The accumulated comments (if any) */
     private final String comment;
 
@@ -72,18 +67,8 @@ public final class CSVRecord implements Serializable, Iterable<String> {
         this.parser = parser;
         this.comment = comment;
         this.characterPosition = characterPosition;
-        this.characterByte = 0L;
     }
 
-    CSVRecord(final CSVParser parser, final String[] values,  final String comment, final long recordNumber,
-            final long characterPosition, final long characterByte) {
-        this.recordNumber = recordNumber;
-        this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY;
-        this.parser = parser;
-        this.comment = comment;
-        this.characterPosition = characterPosition;
-        this.characterByte = characterByte;
-    }
     /**
      * Returns a value by {@link Enum}.
      *
@@ -159,15 +144,6 @@ public long getCharacterPosition() {
         return characterPosition;
     }
 
-    /**
-     * Returns the start byte of this record as a character byte in the source stream.
-     *
-     * @return the start byte of this record as a character byte in the source stream.
-     */
-    public long getCharacterByte() {
-        return characterByte;
-    }
-
     /**
      * Returns the comment for this record, if any.
      * Note that comments are attached to the following record.

diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -24,10 +24,6 @@
 
 import java.io.IOException;
 import java.io.Reader;
-import java.nio.CharBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetEncoder;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -53,27 +49,13 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
     private long position;
     private long positionMark;
 
-    /** The number of bytes read so far */
-    private long bytesRead;
-    private long bytesReadMark;
-
-    /** Encoder used to calculate the bytes of characters */
-    CharsetEncoder encoder;
-
     /**
      * Constructs a new instance using the default buffer size.
      */
     ExtendedBufferedReader(final Reader reader) {
         super(reader);
     }
 
-    ExtendedBufferedReader(final Reader reader, String encoding) {
-        super(reader);
-        if (encoding != null) {
-            encoder = Charset.forName(encoding).newEncoder();
-        }
-    }
-
     /**
      * Closes the stream.
      *
@@ -126,7 +108,6 @@ public void mark(final int readAheadLimit) throws IOException {
         lineNumberMark = lineNumber;
         lastCharMark = lastChar;
         positionMark = position;
-        bytesReadMark = bytesRead;
         super.mark(readAheadLimit);
     }
 
@@ -137,43 +118,11 @@ public int read() throws IOException {
             current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
             lineNumber++;
         }
-        if (encoder != null) {
-            this.bytesRead += getCharBytes(current);
-        }
         lastChar = current;
         position++;
         return lastChar;
     }
 
-    /**
-     *  In Java, a char data type are based on the original Unicode
-     *  specification, which defined characters as fixed-width 16-bit entities.
-     *   U+0000 to U+FFFF:
-     *     - BMP, represented using 1 16-bit char
-     *     - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars
-     *   U+10000 to U+10FFFF:
-     *     - Supplementary characters, represented as a pair of characters,
-     *     the first char from the high-surrogates range (\uD800-\uDBFF),
-     *     and the second char from the low-surrogates range (uDC00-\uDFFF).
-     *     - Consists of UTF-8 some 3-byte chars and 4-byte chars
-     */
-    private long getCharBytes(int current) throws CharacterCodingException {
-        char cChar = (char) current;
-        char lChar = (char) lastChar;
-        if (!Character.isSurrogate(cChar)) {
-            return encoder.encode(
-                CharBuffer.wrap(new char[] {cChar})).limit();
-        } else {
-            if (Character.isHighSurrogate(cChar)) {
-                // Move on to the next char (low surrogate)
-                return 0;
-            } else if (Character.isSurrogatePair(lChar, cChar)) {
-                return encoder.encode(
-                    CharBuffer.wrap(new char[] {lChar, cChar})).limit();
-            } else throw new CharacterCodingException();
-        }
-    }
-
     @Override
     public int read(final char[] buf, final int offset, final int length) throws IOException {
         if (length == 0) {
@@ -238,17 +187,7 @@ public void reset() throws IOException {
         lineNumber = lineNumberMark;
         lastChar = lastCharMark;
         position = positionMark;
-        bytesRead = bytesReadMark;
         super.reset();
     }
 
-    /**
-     * Gets the number of bytes read by the reader.
-     *
-     * @return the number of bytes read by the read
-     */
-    long getBytesRead() {
-        return this.bytesRead;
-    }
-
 }
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -103,15 +103,6 @@ long getCharacterPosition() {
         return reader.getPosition();
     }
 
-    /**
-     * Returns the number of bytes read
-     *
-     * @return the number of bytes read
-     */
-    long getBytesRead() {
-        return reader.getBytesRead();
-    }
-
     /**
      * Returns the current line number
      *

diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -701,84 +701,6 @@ public void testGetHeaderComment_NoComment3() throws IOException {
         }
     }
 
-    @Test
-    public void testGetRecordThreeBytesRead() throws Exception {
-        String code = "id,date,val5,val4\n" +
-            "11111111111111,'4017-09-01',きちんと節分近くには咲いてる～,v4\n" +
-            "22222222222222,'4017-01-01',おはよう私の友人～,v4\n" +
-            "33333333333333,'4017-01-01',きる自然の力ってすごいな～,v4\n";
-        // String code = "'1',4";
-        // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
-        final CSVFormat format = CSVFormat.Builder.create()
-                               .setDelimiter(',')
-                               .setQuote('\'')
-                               .build();
-        // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
-        CSVParser parser =  format.parse(new StringReader(code), 0L, 1L, "UTF-8");
-
-        CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
-        assertEquals(0, parser.getRecordNumber());
-        assertNotNull(record = parser.nextRecord());
-        assertEquals(1, record.getRecordNumber());
-        assertEquals(code.indexOf('i'), record.getCharacterPosition());
-        assertEquals(record.getCharacterByte(), record.getCharacterPosition());
-
-        assertNotNull(record = parser.nextRecord());
-        assertEquals(2, record.getRecordNumber());
-        assertEquals(code.indexOf('1'), record.getCharacterPosition());
-        assertEquals(record.getCharacterByte(), record.getCharacterPosition());
-
-        assertNotNull(record = parser.nextRecord());
-        assertEquals(3, record.getRecordNumber());
-        assertEquals(code.indexOf('2'), record.getCharacterPosition());
-        assertEquals(record.getCharacterByte(), 95);
-
-        assertNotNull(record = parser.nextRecord());
-        assertEquals(4, record.getRecordNumber());
-        assertEquals(code.indexOf('3'), record.getCharacterPosition());
-        assertEquals(record.getCharacterByte(), 154);
-
-        parser.close();
-
-    }
-
-    @Test
-    public void testGetRecordFourBytesRead() throws Exception {
-        String code = "id,a,b,c\n" +
-            "1,😊,🤔,😂\n" +
-            "2,😊,🤔,😂\n" +
-            "3,😊,🤔,😂\n";
-        // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
-        final CSVFormat format = CSVFormat.Builder.create()
-            .setDelimiter(',')
-            .setQuote('\'')
-            .build();
-
-        // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
-        CSVParser parser =  format.parse(new StringReader(code), 0L, 1L, "UTF-8");
-
-        CSVRecord record;
-        assertEquals(0, parser.getRecordNumber());
-        assertNotNull(record = parser.nextRecord());
-        assertEquals(1, record.getRecordNumber());
-        assertEquals(code.indexOf('i'), record.getCharacterPosition());
-        assertEquals(record.getCharacterByte(), record.getCharacterPosition());
-
-        assertNotNull(record = parser.nextRecord());
-        assertEquals(2, record.getRecordNumber());
-        assertEquals(code.indexOf('1'), record.getCharacterPosition());
-        assertEquals(record.getCharacterByte(), record.getCharacterPosition());
-        assertNotNull(record = parser.nextRecord());
-        assertEquals(3, record.getRecordNumber());
-        assertEquals(code.indexOf('2'), record.getCharacterPosition());
-        assertEquals(record.getCharacterByte(), 26);
-        assertNotNull(record = parser.nextRecord());
-        assertEquals(4, record.getRecordNumber());
-        assertEquals(code.indexOf('3'), record.getCharacterPosition());
-        assertEquals(record.getCharacterByte(), 43);
-        parser.close();
-    }
-
     @Test
     public void testGetHeaderMap() throws Exception {
         try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {