From 6d8505a19a5d7f9eefd24f85c5b2b42c94c16d37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Finn=20V=C3=B6lkel?= Date: Tue, 9 Apr 2024 10:25:46 +0200 Subject: [PATCH] GH-30866: [Java] fix SplitAndTransfer throws for (0,0) if vector empty (#41066) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is addresses https://issues.apache.org/jira/browse/ARROW-15382 and is reopening of https://github.com/apache/arrow/pull/12250 (which I asked to be reopened). I tried to address all the comments from the previous discussion, added some more tests and fixed an issue in the old commit. * GitHub Issue: #30866 Authored-by: Finn Völkel Signed-off-by: David Li --- .../vector/BaseLargeVariableWidthVector.java | 12 +- .../arrow/vector/BaseVariableWidthVector.java | 6 +- .../arrow/vector/complex/ListVector.java | 28 ++-- .../arrow/vector/TestLargeVarCharVector.java | 4 +- .../arrow/vector/TestSplitAndTransfer.java | 121 +++++++++++++++++- 5 files changed, 140 insertions(+), 31 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 34c9e73a0b072..2ef6e4bd8b374 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -763,16 +763,14 @@ public void transferTo(BaseLargeVariableWidthVector target) { */ public void splitAndTransferTo(int startIndex, int length, BaseLargeVariableWidthVector target) { - Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, - "Invalid startIndex: %s", startIndex); - Preconditions.checkArgument(startIndex + length <= valueCount, - "Invalid length: %s", length); + Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); compareTypes(target, "splitAndTransferTo"); target.clear(); - splitAndTransferValidityBuffer(startIndex, length, target); - splitAndTransferOffsetBuffer(startIndex, length, target); - target.setLastSet(length - 1); if (length > 0) { + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); target.setValueCount(length); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 6b82dd7729a6c..d533629cdd44e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -808,10 +808,10 @@ public void splitAndTransferTo(int startIndex, int length, "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); compareTypes(target, "splitAndTransferTo"); target.clear(); - splitAndTransferValidityBuffer(startIndex, length, target); - splitAndTransferOffsetBuffer(startIndex, length, target); - target.setLastSet(length - 1); if (length > 0) { + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); target.setValueCount(length); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index c61a2894aafd9..656c4add04201 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -552,21 +552,23 @@ public void transfer() { public void splitAndTransfer(int startIndex, int length) { Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); - final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); - final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); - to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); - /* splitAndTransfer offset buffer */ - for (int i = 0; i < length + 1; i++) { - final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; - to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset); + if (length > 0) { + final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); + final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; + to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + /* splitAndTransfer offset buffer */ + for (int i = 0; i < length + 1; i++) { + final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; + to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset); + } + /* splitAndTransfer validity buffer */ + splitAndTransferValidityBuffer(startIndex, length, to); + /* splitAndTransfer data buffer */ + dataTransferPair.splitAndTransfer(startPoint, sliceLength); + to.lastSet = length - 1; + to.setValueCount(length); } - /* splitAndTransfer validity buffer */ - splitAndTransferValidityBuffer(startIndex, length, to); - /* splitAndTransfer data buffer */ - dataTransferPair.splitAndTransfer(startPoint, sliceLength); - to.lastSet = length - 1; - to.setValueCount(length); } /* diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java index 62d09da86d652..06b27a9eba156 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java @@ -166,7 +166,7 @@ public void testInvalidStartIndex() { IllegalArgumentException.class, () -> tp.splitAndTransfer(valueCount, 10)); - assertEquals("Invalid startIndex: 500", e.getMessage()); + assertEquals("Invalid parameters startIndex: 500, length: 10 for valueCount: 500", e.getMessage()); } } @@ -185,7 +185,7 @@ public void testInvalidLength() { IllegalArgumentException.class, () -> tp.splitAndTransfer(0, valueCount * 2)); - assertEquals("Invalid length: 1000", e.getMessage()); + assertEquals("Invalid parameters startIndex: 0, length: 1000 for valueCount: 500", e.getMessage()); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java index 3580a321f01c9..396f5665e0382 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java @@ -28,6 +28,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; @@ -49,7 +50,7 @@ public class TestSplitAndTransfer { public void init() { allocator = new RootAllocator(Long.MAX_VALUE); } - + @After public void terminate() throws Exception { allocator.close(); @@ -65,7 +66,116 @@ private void populateVarcharVector(final VarCharVector vector, int valueCount, S } vector.setValueCount(valueCount); } - + + private void populateIntVector(final IntVector vector, int valueCount) { + for (int i = 0; i < valueCount; i++) { + vector.set(i, i); + } + vector.setValueCount(valueCount); + } + + private void populateDenseUnionVector(final DenseUnionVector vector, int valueCount) { + VarCharVector varCharVector = vector.addOrGet("varchar", FieldType.nullable(new ArrowType.Utf8()), + VarCharVector.class); + BigIntVector intVector = vector.addOrGet("int", + FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + for (int i = 0; i < valueCount; i++) { + vector.setTypeId(i, (byte) (i % 2)); + if (i % 2 == 0) { + final String s = String.format("%010d", i); + varCharVector.setSafe(i / 2, s.getBytes(StandardCharsets.UTF_8)); + } else { + intVector.setSafe(i / 2, i); + } + } + vector.setValueCount(valueCount); + } + + @Test + public void testWithEmptyVector() { + // MapVector use TransferImpl from ListVector + ListVector listVector = ListVector.empty("", allocator); + TransferPair transferPair = listVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseFixedWidthVector + IntVector intVector = new IntVector("", allocator); + transferPair = intVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseVariableWidthVector + VarCharVector varCharVector = new VarCharVector("", allocator); + transferPair = varCharVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseLargeVariableWidthVector + LargeVarCharVector largeVarCharVector = new LargeVarCharVector("", allocator); + transferPair = largeVarCharVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // StructVector + StructVector structVector = StructVector.empty("", allocator); + transferPair = structVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // FixedSizeListVector + FixedSizeListVector fixedSizeListVector = FixedSizeListVector.empty("", 0, allocator); + transferPair = fixedSizeListVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // FixedSizeBinaryVector + FixedSizeBinaryVector fixedSizeBinaryVector = new FixedSizeBinaryVector("", allocator, 4); + transferPair = fixedSizeBinaryVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // UnionVector + UnionVector unionVector = UnionVector.empty("", allocator); + transferPair = unionVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // DenseUnionVector + DenseUnionVector duv = DenseUnionVector.empty("", allocator); + transferPair = duv.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + + // non empty from vector + + // BaseFixedWidthVector + IntVector fromIntVector = new IntVector("", allocator); + fromIntVector.allocateNew(100); + populateIntVector(fromIntVector, 100); + transferPair = fromIntVector.getTransferPair(allocator); + IntVector toIntVector = (IntVector) transferPair.getTo(); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, toIntVector.getValueCount()); + + transferPair.splitAndTransfer(50, 0); + assertEquals(0, toIntVector.getValueCount()); + + transferPair.splitAndTransfer(100, 0); + assertEquals(0, toIntVector.getValueCount()); + fromIntVector.clear(); + toIntVector.clear(); + + // DenseUnionVector + DenseUnionVector fromDuv = DenseUnionVector.empty("", allocator); + populateDenseUnionVector(fromDuv, 100); + transferPair = fromDuv.getTransferPair(allocator); + DenseUnionVector toDUV = (DenseUnionVector) transferPair.getTo(); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, toDUV.getValueCount()); + + transferPair.splitAndTransfer(50, 0); + assertEquals(0, toDUV.getValueCount()); + + transferPair.splitAndTransfer(100, 0); + assertEquals(0, toDUV.getValueCount()); + fromDuv.clear(); + toDUV.clear(); + } + @Test /* VarCharVector */ public void test() throws Exception { try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { @@ -73,13 +183,13 @@ public void test() throws Exception { final int valueCount = 500; final String[] compareArray = new String[valueCount]; - + populateVarcharVector(varCharVector, valueCount, compareArray); - + final TransferPair tp = varCharVector.getTransferPair(allocator); final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); final int[][] startLengths = {{0, 201}, {201, 0}, {201, 200}, {401, 99}}; - + for (final int[] startLength : startLengths) { final int start = startLength[0]; final int length = startLength[1]; @@ -429,5 +539,4 @@ public void testMapVectorZeroStartIndexAndLength() { newMapVector.clear(); } } - }