From 89908f9d4b33094f0923358e3d740a7256754ce7 Mon Sep 17 00:00:00 2001 From: xianyangliu Date: Mon, 8 Aug 2022 19:44:19 +0800 Subject: [PATCH 1/4] fixes int overflow --- .../apache/arrow/vector/BaseVariableWidthVector.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 866dd9e218fc1..39bf29d7d1cfa 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -46,6 +46,7 @@ public abstract class BaseVariableWidthVector extends BaseValueVector implements VariableWidthVector, FieldVector, VectorDefinitionSetter { private static final int DEFAULT_RECORD_BYTE_COUNT = 8; private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; + private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); private int lastValueCapacity; private long lastValueAllocationSizeInBytes; @@ -430,7 +431,7 @@ public void allocateNew(int valueCount) { /* Check if the data buffer size is within bounds. */ private void checkDataBufferSize(long size) { - if (size > MAX_ALLOCATION_SIZE || size < 0) { + if (size > MAX_BUFFER_SIZE || size < 0) { throw new OversizedAllocationException("Memory required for vector " + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); } @@ -445,7 +446,7 @@ private long computeAndCheckOffsetsBufferSize(int valueCount) { * an additional slot in offset buffer. */ final long size = computeCombinedBufferSize(valueCount + 1, OFFSET_WIDTH); - if (size > MAX_ALLOCATION_SIZE) { + if (size > MAX_BUFFER_SIZE) { throw new OversizedAllocationException("Memory required for vector capacity " + valueCount + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); @@ -1240,7 +1241,7 @@ protected final void handleSafe(int index, int dataLength) { * So even though we may have setup an initial capacity of 1024 * elements in the vector, it is quite possible * that we need to reAlloc() the data buffer when we are setting - * the 5th element in the vector simply because previous + * the 1025th element in the vector simply because previous * variable length elements have exhausted the buffer capacity. * However, we really don't need to reAlloc() validity and * offset buffers until we try to set the 1025th element @@ -1251,7 +1252,9 @@ protected final void handleSafe(int index, int dataLength) { reallocValidityAndOffsetBuffers(); } final int startOffset = lastSet < 0 ? 0 : getStartOffset(lastSet + 1); - while (valueBuffer.capacity() < (startOffset + dataLength)) { + final long targetCapacity = startOffset + dataLength; + checkDataBufferSize(targetCapacity); + while (valueBuffer.capacity() < targetCapacity) { reallocDataBuffer(); } } From 830319f46774e5a94c0a25158882cdd733215592 Mon Sep 17 00:00:00 2001 From: xianyangliu Date: Mon, 8 Aug 2022 19:52:22 +0800 Subject: [PATCH 2/4] update --- .../java/org/apache/arrow/vector/BaseVariableWidthVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 39bf29d7d1cfa..2798682054536 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -1251,7 +1251,7 @@ protected final void handleSafe(int index, int dataLength) { while (index >= getValueCapacity()) { reallocValidityAndOffsetBuffers(); } - final int startOffset = lastSet < 0 ? 0 : getStartOffset(lastSet + 1); + final long startOffset = lastSet < 0 ? 0 : getStartOffset(lastSet + 1); final long targetCapacity = startOffset + dataLength; checkDataBufferSize(targetCapacity); while (valueBuffer.capacity() < targetCapacity) { From b2c9315836f3b50dbeddc2de21d2392f3d8ce742 Mon Sep 17 00:00:00 2001 From: xianyangliu Date: Tue, 9 Aug 2022 11:40:43 +0800 Subject: [PATCH 3/4] address comments and add ut --- .../arrow/vector/BaseVariableWidthVector.java | 2 +- .../apache/arrow/vector/TestValueVector.java | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 2798682054536..bee85850680aa 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -1241,7 +1241,7 @@ protected final void handleSafe(int index, int dataLength) { * So even though we may have setup an initial capacity of 1024 * elements in the vector, it is quite possible * that we need to reAlloc() the data buffer when we are setting - * the 1025th element in the vector simply because previous + * the 5th element in the vector simply because previous * variable length elements have exhausted the buffer capacity. * However, we really don't need to reAlloc() validity and * offset buffers until we try to set the 1025th element diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 516daa2362280..0928d3eb03082 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -1137,6 +1137,25 @@ public void testNullableVarType2() { } } + @Test(expected = OversizedAllocationException.class) + public void testReallocateCheckSuccess() { + + // Create a new value vector for 1024 integers. + try (final VarBinaryVector vector = newVarBinaryVector(EMPTY_SCHEMA_PATH, allocator)) { + vector.allocateNew(1024 * 10, 1024); + + vector.set(0, STR1); + // Check the sample strings. + assertArrayEquals(STR1, vector.get(0)); + + // update the index offset to a larger one + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setInt(VarBinaryVector.OFFSET_WIDTH, Integer.MAX_VALUE - 5); + + vector.setValueLengthSafe(1, 6); + } + } + /* * generic tests From e12a27dc6957ff4db2cdee84ea99c70cb22603e0 Mon Sep 17 00:00:00 2001 From: xianyangliu Date: Wed, 10 Aug 2022 10:56:03 +0800 Subject: [PATCH 4/4] address comments --- .../arrow/vector/BaseVariableWidthVector.java | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index bee85850680aa..2a89590bf8440 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -433,7 +433,8 @@ public void allocateNew(int valueCount) { private void checkDataBufferSize(long size) { if (size > MAX_BUFFER_SIZE || size < 0) { throw new OversizedAllocationException("Memory required for vector " + - " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + "is (" + size + "), which is overflow or more than max allowed (" + MAX_BUFFER_SIZE + "). " + + "You could consider using LargeVarCharVector/LargeVarBinaryVector for large strings/large bytes types"); } } @@ -449,7 +450,7 @@ private long computeAndCheckOffsetsBufferSize(int valueCount) { if (size > MAX_BUFFER_SIZE) { throw new OversizedAllocationException("Memory required for vector capacity " + valueCount + - " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + " is (" + size + "), which is more than max allowed (" + MAX_BUFFER_SIZE + ")"); } return size; } @@ -515,13 +516,33 @@ public void reallocDataBuffer() { newAllocationSize = INITIAL_BYTE_COUNT * 2L; } } - newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + + reallocDataBuffer(newAllocationSize); + } + + /** + * Reallocate the data buffer to given size. Data Buffer stores the actual data for + * VARCHAR or VARBINARY elements in the vector. The actual allocate size may be larger + * than the request one because it will round up the provided value to the nearest + * power of two. + * + * @param desiredAllocSize the desired new allocation size + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocDataBuffer(long desiredAllocSize) { + if (desiredAllocSize == 0) { + return; + } + + final long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); assert newAllocationSize >= 1; checkDataBufferSize(newAllocationSize); final ArrowBuf newBuf = allocator.buffer(newAllocationSize); - newBuf.setBytes(0, valueBuffer, 0, currentBufferCapacity); + newBuf.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); valueBuffer.getReferenceManager().release(); valueBuffer = newBuf; lastValueAllocationSizeInBytes = valueBuffer.capacity(); @@ -1253,9 +1274,8 @@ protected final void handleSafe(int index, int dataLength) { } final long startOffset = lastSet < 0 ? 0 : getStartOffset(lastSet + 1); final long targetCapacity = startOffset + dataLength; - checkDataBufferSize(targetCapacity); - while (valueBuffer.capacity() < targetCapacity) { - reallocDataBuffer(); + if (valueBuffer.capacity() < targetCapacity) { + reallocDataBuffer(targetCapacity); } }