diff --git a/pom.xml b/pom.xml index bfdf9e74a..da5bc1b4e 100644 --- a/pom.xml +++ b/pom.xml @@ -28,7 +28,6 @@ https://commons.apache.org/proper/commons-csv/ 2005 The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types. - jar @@ -232,8 +231,6 @@ src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv src/test/resources/org/apache/commons/csv/csv-167/sample1.csv src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv - src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv - src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 9833a26ed..3d4b43c6b 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -2097,30 +2097,6 @@ public CSVParser parse(final Reader reader) throws IOException { return new CSVParser(reader, this); } - /** - * Parses the specified content. - * - *

- * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number, - * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s. - *

- * - *

- * For additional parsing options, see the various static parse methods available on {@link CSVParser}. - *

- * - * @param reader the input stream - * @param characterOffset the character offset to start parsing from - * @param recordNumber the initial record number to start counting from - * @param encoding the character encoding of the input stream - * @return a parser over a stream of {@link CSVRecord}s. - * @throws IOException If an I/O error occurs - * @throws CSVException Thrown on invalid input. - */ - public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { - return new CSVParser(reader, this, characterOffset, recordNumber, encoding); - } - /** * Prints to the specified output. * diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 75bf78d20..f0341cf71 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -511,39 +511,10 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException @SuppressWarnings("resource") public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { - this(reader, format, characterOffset, recordNumber, null); - } - - /** - * Constructs a new instance using the given {@link CSVFormat} - * - *

- * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, - * unless you close the {@code reader}. - *

- * - * @param reader - * a Reader containing CSV-formatted input. Must not be null. - * @param format - * the CSVFormat used for CSV parsing. Must not be null. - * @param characterOffset - * Lexer offset when the parser does not start parsing at the beginning of the source. - * @param recordNumber - * The next record number to assign - * @param encoding - * The encoding to use for the reader - * @throws IllegalArgumentException - * If the parameters of the format are inconsistent or if either the reader or format is null. - * @throws IOException - * If there is a problem reading the header or skipping the first record - * @throws CSVException Thrown on invalid input. - */ - public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, - String encoding) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; @@ -870,7 +841,6 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; - final long startCharByte = lexer.getBytesRead() + this.characterOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); @@ -908,7 +878,7 @@ CSVRecord nextRecord() throws IOException { recordNumber++; final String comment = Objects.toString(sb, null); result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment, - recordNumber, startCharPosition, startCharByte); + recordNumber, startCharPosition); } return result; } diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index f0a0a6b81..1fac65843 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -48,11 +48,6 @@ public final class CSVRecord implements Serializable, Iterable { */ private final long characterPosition; - /** - * The start byte of this record as a character byte in the source stream. - */ - private final long characterByte; - /** The accumulated comments (if any) */ private final String comment; @@ -72,18 +67,8 @@ public final class CSVRecord implements Serializable, Iterable { this.parser = parser; this.comment = comment; this.characterPosition = characterPosition; - this.characterByte = 0L; } - CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber, - final long characterPosition, final long characterByte) { - this.recordNumber = recordNumber; - this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY; - this.parser = parser; - this.comment = comment; - this.characterPosition = characterPosition; - this.characterByte = characterByte; - } /** * Returns a value by {@link Enum}. * @@ -159,15 +144,6 @@ public long getCharacterPosition() { return characterPosition; } - /** - * Returns the start byte of this record as a character byte in the source stream. - * - * @return the start byte of this record as a character byte in the source stream. - */ - public long getCharacterByte() { - return characterByte; - } - /** * Returns the comment for this record, if any. * Note that comments are attached to the following record. diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 2a82d48a5..18c922a50 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -24,10 +24,6 @@ import java.io.IOException; import java.io.Reader; -import java.nio.CharBuffer; -import java.nio.charset.CharacterCodingException; -import java.nio.charset.Charset; -import java.nio.charset.CharsetEncoder; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedBufferedReader; @@ -53,13 +49,6 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long position; private long positionMark; - /** The number of bytes read so far */ - private long bytesRead; - private long bytesReadMark; - - /** Encoder used to calculate the bytes of characters */ - CharsetEncoder encoder; - /** * Constructs a new instance using the default buffer size. */ @@ -67,13 +56,6 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } - ExtendedBufferedReader(final Reader reader, String encoding) { - super(reader); - if (encoding != null) { - encoder = Charset.forName(encoding).newEncoder(); - } - } - /** * Closes the stream. * @@ -126,7 +108,6 @@ public void mark(final int readAheadLimit) throws IOException { lineNumberMark = lineNumber; lastCharMark = lastChar; positionMark = position; - bytesReadMark = bytesRead; super.mark(readAheadLimit); } @@ -137,43 +118,11 @@ public int read() throws IOException { current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { lineNumber++; } - if (encoder != null) { - this.bytesRead += getCharBytes(current); - } lastChar = current; position++; return lastChar; } - /** - * In Java, a char data type are based on the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. - * U+0000 to U+FFFF: - * - BMP, represented using 1 16-bit char - * - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars - * U+10000 to U+10FFFF: - * - Supplementary characters, represented as a pair of characters, - * the first char from the high-surrogates range (\uD800-\uDBFF), - * and the second char from the low-surrogates range (uDC00-\uDFFF). - * - Consists of UTF-8 some 3-byte chars and 4-byte chars - */ - private long getCharBytes(int current) throws CharacterCodingException { - char cChar = (char) current; - char lChar = (char) lastChar; - if (!Character.isSurrogate(cChar)) { - return encoder.encode( - CharBuffer.wrap(new char[] {cChar})).limit(); - } else { - if (Character.isHighSurrogate(cChar)) { - // Move on to the next char (low surrogate) - return 0; - } else if (Character.isSurrogatePair(lChar, cChar)) { - return encoder.encode( - CharBuffer.wrap(new char[] {lChar, cChar})).limit(); - } else throw new CharacterCodingException(); - } - } - @Override public int read(final char[] buf, final int offset, final int length) throws IOException { if (length == 0) { @@ -238,17 +187,7 @@ public void reset() throws IOException { lineNumber = lineNumberMark; lastChar = lastCharMark; position = positionMark; - bytesRead = bytesReadMark; super.reset(); } - /** - * Gets the number of bytes read by the reader. - * - * @return the number of bytes read by the read - */ - long getBytesRead() { - return this.bytesRead; - } - } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index afbba4d21..6d9c8a485 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -103,15 +103,6 @@ long getCharacterPosition() { return reader.getPosition(); } - /** - * Returns the number of bytes read - * - * @return the number of bytes read - */ - long getBytesRead() { - return reader.getBytesRead(); - } - /** * Returns the current line number * diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index fd1ecdb02..8f5d577f6 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -701,84 +701,6 @@ public void testGetHeaderComment_NoComment3() throws IOException { } } - @Test - public void testGetRecordThreeBytesRead() throws Exception { - String code = "id,date,val5,val4\n" + - "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + - "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + - "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; - // String code = "'1',4"; - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); - final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); - - CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); - assertEquals(0, parser.getRecordNumber()); - assertNotNull(record = parser.nextRecord()); - assertEquals(1, record.getRecordNumber()); - assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(2, record.getRecordNumber()); - assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(3, record.getRecordNumber()); - assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 95); - - assertNotNull(record = parser.nextRecord()); - assertEquals(4, record.getRecordNumber()); - assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 154); - - parser.close(); - - } - - @Test - public void testGetRecordFourBytesRead() throws Exception { - String code = "id,a,b,c\n" + - "1,😊,🤔,😂\n" + - "2,😊,🤔,😂\n" + - "3,😊,🤔,😂\n"; - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); - final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - - // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); - - CSVRecord record; - assertEquals(0, parser.getRecordNumber()); - assertNotNull(record = parser.nextRecord()); - assertEquals(1, record.getRecordNumber()); - assertEquals(code.indexOf('i'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - - assertNotNull(record = parser.nextRecord()); - assertEquals(2, record.getRecordNumber()); - assertEquals(code.indexOf('1'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), record.getCharacterPosition()); - assertNotNull(record = parser.nextRecord()); - assertEquals(3, record.getRecordNumber()); - assertEquals(code.indexOf('2'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 26); - assertNotNull(record = parser.nextRecord()); - assertEquals(4, record.getRecordNumber()); - assertEquals(code.indexOf('3'), record.getCharacterPosition()); - assertEquals(record.getCharacterByte(), 43); - parser.close(); - } - @Test public void testGetHeaderMap() throws Exception { try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) { diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java deleted file mode 100644 index 7dbc23caf..000000000 --- a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.csv; -import static org.junit.jupiter.api.Assertions.assertEquals; - - -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.Reader; - - -import org.junit.jupiter.api.Test; - - -public class JiraCsv196Test { - @Test - public void parseThreeBytes() throws IOException { - - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); - final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - // CSVParser parser = new CSVParser(getTestInput( - // "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(getTestInput( - "org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8"); - long[] charByteKey = {0, 89, 242, 395}; - int idx = 0; - for (CSVRecord record : parser) { - assertEquals(charByteKey[idx++], record.getCharacterByte()); - } - parser.close(); - } - - - @Test - public void parseFourBytes() throws IOException { - // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); - final CSVFormat format = CSVFormat.Builder.create() - .setDelimiter(',') - .setQuote('\'') - .build(); - - CSVParser parser = format.parse(getTestInput( - "org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8"); - - long[] charByteKey = {0, 84, 701, 1318, 1935}; - int idx = 0; - for (CSVRecord record : parser) { - assertEquals(charByteKey[idx++], record.getCharacterByte()); - } - parser.close(); - } - - - private Reader getTestInput(String path) { - return new InputStreamReader( - ClassLoader.getSystemClassLoader().getResourceAsStream(path)); - } -} diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv deleted file mode 100644 index 0bff7a44f..000000000 --- a/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv +++ /dev/null @@ -1,5 +0,0 @@ -id,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15 -1,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 -2,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 -3,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 -4,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 \ No newline at end of file diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv deleted file mode 100644 index b06e04bd6..000000000 --- a/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv +++ /dev/null @@ -1,4 +0,0 @@ -id,date,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15 -00000000000001,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 -00000000000002,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 -00000000000003,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 \ No newline at end of file