diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java index 5b64353eaf91d..493323c92e7ab 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java @@ -48,8 +48,6 @@ public abstract class BaseVariableWidthViewVector extends BaseValueVector private static final int DEFAULT_RECORD_BYTE_COUNT = 8; private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); - - private static final int INLINE_SIZE = 12; private int lastValueCapacity; private long lastValueAllocationSizeInBytes; private long initialBufferSize; @@ -57,6 +55,13 @@ public abstract class BaseVariableWidthViewVector extends BaseValueVector /* protected members */ public static final int OFFSET_WIDTH = 4; /* 4 byte unsigned int to track offsets */ + protected static final int INLINE_SIZE = 12; + protected static final int VIEW_BUFFER_SIZE = 16; + protected static final int LENGTH_WIDTH = 4; + protected static final int INLINE_BUF_DATA_WIDTH = 12; + protected static final int PREFIX_WIDTH = 4; + protected static final int BUF_INDEX_WIDTH = 4; + protected static final int BUF_OFFSET_WIDTH = 4; protected static final byte[] emptyByteArray = new byte[]{}; protected ArrowBuf validityBuffer; protected ArrowBuf valueBuffer; @@ -66,129 +71,6 @@ public abstract class BaseVariableWidthViewVector extends BaseValueVector protected final Field field; protected List dataBuffers; - protected List views; - - interface ViewBuffer { - } - - static class InlineValueBuffer implements ViewBuffer { - private final ArrowBuf valueBuffer; - private final int length; - - public InlineValueBuffer(ArrowBuf buffer, int length) { - this.valueBuffer = buffer; - this.length = length; - } - - public ArrowBuf getValueBuffer() { - return valueBuffer; - } - - public int getLength() { - return length; - } - } - - static class ReferenceValueBuffer implements ViewBuffer { - private final int length; - private final byte[] prefix; - private final int bufId; - private final int offset; - - public ReferenceValueBuffer(int length, byte[] prefix, int bufId, int offset) { - this.length = length; - this.prefix = prefix; - this.bufId = bufId; - this.offset = offset; - } - - public int getLength() { - return length; - } - - public byte[] getPrefix() { - return prefix; - } - - public int getBufId() { - return bufId; - } - - public int getOffset() { - return offset; - } - } - - static class ViewBufferFactory { - private static ViewBuffer createInlineValueBuffer(BufferAllocator allocator, byte[] value, int start, int length, - long allocationSizeInBytes) { - ArrowBuf newBuffer = allocateViewDataBuffer(allocator, allocationSizeInBytes); - newBuffer.setBytes(0, value, start, length); - return new InlineValueBuffer(newBuffer, value.length); - } - - private static ViewBuffer createReferenceValueBuffer(BufferAllocator allocator, byte[] value, int start, - int length, int startOffset, List dataBuffers, long allocationSizeInBytes) { - int prefixLen = Math.min(length, 4); - byte [] prefix = new byte[prefixLen]; - System.arraycopy(value, 0, prefix, 0, prefixLen); - if (!dataBuffers.isEmpty()) { - // the dataBuffer is already populated with at least one buffer - // determine bufId - ArrowBuf currentBuf = dataBuffers.get(dataBuffers.size() - 1); - if (currentBuf.capacity() - currentBuf.writerIndex() >= length) { - // we can use the current buffer - currentBuf.setBytes(currentBuf.writerIndex(), value, start, length); - currentBuf.writerIndex(currentBuf.writerIndex() + length); - dataBuffers.set(dataBuffers.size() - 1, currentBuf); - return new ReferenceValueBuffer(value.length, prefix, dataBuffers.size() - 1, - (int) currentBuf.writerIndex()); - } else { - // reallocation i.e creating a new buffer and add it to the dataBuffers - ArrowBuf newBuffer = allocateViewDataBuffer(allocator, allocationSizeInBytes); - newBuffer.setBytes(0, value, start, length); - newBuffer.writerIndex(length); - dataBuffers.add(newBuffer); - return new ReferenceValueBuffer(value.length, prefix, dataBuffers.size() - 1, 0); - } - } else { - // allocating the first buffer and append it to dataBuffers - ArrowBuf newBuffer = allocateViewDataBuffer(allocator, allocationSizeInBytes); - newBuffer.setBytes(0, value, start, length); - newBuffer.writerIndex(length); - dataBuffers.add(newBuffer); - return new ReferenceValueBuffer(value.length, prefix, 0, 0); - } - } - - private static void checkDataBufferSize(long size) { - if (size > MAX_BUFFER_SIZE || size < 0) { - throw new OversizedAllocationException("Memory required for vector " + - "is (" + size + "), which is overflow or more than max allowed (" + MAX_BUFFER_SIZE + "). " + - "You could consider using LargeVarCharVector/LargeVarBinaryVector for large strings/large bytes types"); - } - } - - private static ArrowBuf allocateViewDataBuffer(BufferAllocator allocator, long size) { - final long newAllocationSize = CommonUtil.nextPowerOfTwo(size); - assert newAllocationSize >= 1; - checkDataBufferSize(newAllocationSize); - return allocator.buffer(newAllocationSize); - } - - public static ViewBuffer createValueBuffer(BufferAllocator allocator, byte[] value, int startOffset, int start, - int length, - List dataBuffers) { - if (value.length <= INLINE_SIZE) { - // inline buffer - return createInlineValueBuffer(allocator, value, start, length, length); - } else { - // reference buffer - return createReferenceValueBuffer(allocator, value, start, length, startOffset, dataBuffers, - INITIAL_VALUE_ALLOCATION); - } - } - } /** * Constructs a new instance. @@ -207,7 +89,6 @@ public BaseVariableWidthViewVector(Field field, final BufferAllocator allocator) offsetBuffer = allocator.getEmpty(); validityBuffer = allocator.getEmpty(); valueBuffer = allocator.getEmpty(); - views = new ArrayList<>(); dataBuffers = new ArrayList<>(); } @@ -304,7 +185,7 @@ public void setInitialCapacity(int valueCount) { final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT; checkDataBufferSize(size); computeAndCheckOffsetsBufferSize(valueCount); - // lastValueAllocationSizeInBytes = (int) size; + lastValueAllocationSizeInBytes = (int) size; lastValueCapacity = valueCount; } @@ -319,7 +200,7 @@ public void setInitialCapacity(int valueCount, double density) { long size = Math.max((long) (valueCount * density), 1L); checkDataBufferSize(size); computeAndCheckOffsetsBufferSize(valueCount); - // lastValueAllocationSizeInBytes = (int) size; + lastValueAllocationSizeInBytes = (int) size; lastValueCapacity = valueCount; } @@ -403,26 +284,10 @@ public void clear() { valueBuffer = releaseBuffer(valueBuffer); offsetBuffer = releaseBuffer(offsetBuffer); clearDataBuffers(); - clearViews(); lastSet = -1; valueCount = 0; } - /** - * Release the buffers in views and clear the views. - */ - public void clearViews() { - for (ViewBuffer view : views) { - if (view instanceof InlineValueBuffer) { - ArrowBuf valueBuf = ((InlineValueBuffer) view).valueBuffer; - if (valueBuf != null) { - valueBuf.getReferenceManager().release(); - } - } - } - views.clear(); - } - /** * Release the data buffers and clear the list. */ @@ -533,34 +398,6 @@ private void setReaderAndWriterIndex() { } } - private void setReaderAndWriteIndexForViews() { - if (valueCount == 0) { - for (ViewBuffer view : views) { - if (view instanceof InlineValueBuffer) { - ((InlineValueBuffer) view).getValueBuffer().readerIndex(0); - ((InlineValueBuffer) view).getValueBuffer().writerIndex(0); - } else { - for (ArrowBuf refBuffer : dataBuffers) { - refBuffer.readerIndex(0); - refBuffer.writerIndex(0); - } - } - } - } else { - final int lastDataOffset = getStartOffset(valueCount); - ViewBuffer lastBuffer = views.get(views.size() - 1); - if (lastBuffer instanceof InlineValueBuffer) { - // since we allocate a valueBuffer per each inline, this wouldn't make much sense - ((InlineValueBuffer) lastBuffer).getValueBuffer().writerIndex(lastDataOffset); - } else { - for (ArrowBuf refBuffer : dataBuffers) { - refBuffer.writerIndex(lastDataOffset); - } - } - } - - } - /** * Same as {@link #allocateNewSafe()}. */ @@ -647,11 +484,9 @@ private long computeAndCheckOffsetsBufferSize(int valueCount) { /* allocate the inner buffers */ private void allocateBytes(final long valueBufferSize, final int valueCount) { /* allocate data buffer */ - // long curSize = valueBufferSize; - // valueBuffer = allocator.buffer(curSize); - // valueBuffer.readerIndex(0); - - initialBufferSize = valueBufferSize; + long curSize = valueBufferSize; + valueBuffer = allocator.buffer(curSize); + valueBuffer.readerIndex(0); /* allocate offset buffer and validity buffer */ DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount + 1, OFFSET_WIDTH); @@ -661,7 +496,7 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) { initValidityBuffer(); lastValueCapacity = getValueCapacity(); - // lastValueAllocationSizeInBytes = capAtMaxInt(valueBuffer.capacity()); + lastValueAllocationSizeInBytes = capAtMaxInt(valueBuffer.capacity()); } /* allocate offset buffer */ @@ -705,47 +540,45 @@ public void reallocDataBuffer() { } long newAllocationSize = currentBufferCapacity * 2; - // if (newAllocationSize == 0) { - // if (lastValueAllocationSizeInBytes > 0) { - // newAllocationSize = lastValueAllocationSizeInBytes; - // } else { - // newAllocationSize = INITIAL_BYTE_COUNT * 2L; - // } - // } if (newAllocationSize == 0) { - newAllocationSize = INITIAL_BYTE_COUNT; + if (lastValueAllocationSizeInBytes > 0) { + newAllocationSize = lastValueAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_BYTE_COUNT * 2L; + } } - // reallocDataBuffer(newAllocationSize); + + reallocDataBuffer(newAllocationSize); reallocViewReferenceBuffer(newAllocationSize); } - // /** - // * Reallocate the data buffer to given size. Data Buffer stores the actual data for - // * VARCHAR or VARBINARY elements in the vector. The actual allocate size may be larger - // * than the request one because it will round up the provided value to the nearest - // * power of two. - // * - // * @param desiredAllocSize the desired new allocation size - // * @throws OversizedAllocationException if the desired new size is more than - // * max allowed - // * @throws OutOfMemoryException if the internal memory allocation fails - // */ - // public void reallocDataBuffer(long desiredAllocSize) { - // if (desiredAllocSize == 0) { - // return; - // } - // - // final long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); - // assert newAllocationSize >= 1; - // - // checkDataBufferSize(newAllocationSize); - // - // final ArrowBuf newBuf = allocator.buffer(newAllocationSize); - // newBuf.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); - // valueBuffer.getReferenceManager().release(); - // valueBuffer = newBuf; - // lastValueAllocationSizeInBytes = valueBuffer.capacity(); - // } + /** + * Reallocate the data buffer to given size. Data Buffer stores the actual data for + * VARCHAR or VARBINARY elements in the vector. The actual allocate size may be larger + * than the request one because it will round up the provided value to the nearest + * power of two. + * + * @param desiredAllocSize the desired new allocation size + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocDataBuffer(long desiredAllocSize) { + if (desiredAllocSize == 0) { + return; + } + + final long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); + assert newAllocationSize >= 1; + + checkDataBufferSize(newAllocationSize); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); + valueBuffer.getReferenceManager().release(); + valueBuffer = newBuf; + lastValueAllocationSizeInBytes = valueBuffer.capacity(); + } /** * Reallocate the data buffer for reference buffer. @@ -1174,7 +1007,6 @@ public void setValueCount(int valueCount) { fillHoles(valueCount); lastSet = valueCount - 1; setReaderAndWriterIndex(); - setReaderAndWriteIndexForViews(); } /** @@ -1490,6 +1322,76 @@ protected final void fillHoles(int index) { lastSet = index - 1; } + protected void createValueBuffer(BufferAllocator allocator, byte[] value, List dataBuffers) { + if (value.length <= INLINE_SIZE) { + // inline buffer + // set length + valueBuffer.setInt(valueBuffer.writerIndex(), value.length); + // set data + valueBuffer.writerIndex(valueBuffer.writerIndex() + LENGTH_WIDTH); + valueBuffer.setBytes(valueBuffer.writerIndex(), value, 0, value.length); + valueBuffer.writerIndex(valueBuffer.writerIndex() + INLINE_BUF_DATA_WIDTH); + } else { + // reference buffer + if (dataBuffers.isEmpty()) { + // first data buffer needs to be added + ArrowBuf newDataBuf = allocator.buffer(lastValueAllocationSizeInBytes); + // set length + valueBuffer.setInt(valueBuffer.writerIndex(), value.length); + valueBuffer.writerIndex(valueBuffer.writerIndex() + LENGTH_WIDTH); + // set prefix + valueBuffer.setBytes(valueBuffer.writerIndex(), value, 0, PREFIX_WIDTH); + valueBuffer.writerIndex(valueBuffer.writerIndex() + PREFIX_WIDTH); + // set buf id + valueBuffer.setInt(valueBuffer.writerIndex(), /*first buffer*/0); + valueBuffer.writerIndex(valueBuffer.writerIndex() + BUF_INDEX_WIDTH); + // set offset + valueBuffer.setInt(valueBuffer.writerIndex(), 0); + valueBuffer.writerIndex(valueBuffer.writerIndex() + BUF_OFFSET_WIDTH); + newDataBuf.setBytes(0, value, 0, value.length); + newDataBuf.writerIndex(value.length); + dataBuffers.add(newDataBuf); + } else { + // insert to the last buffer in the data buffers or allocate new if the last buffer isn't enough + // set length + valueBuffer.setInt(valueBuffer.writerIndex(), value.length); + valueBuffer.writerIndex(valueBuffer.writerIndex() + LENGTH_WIDTH); + // set prefix + valueBuffer.setBytes(valueBuffer.writerIndex(), value, 0, PREFIX_WIDTH); + valueBuffer.writerIndex(valueBuffer.writerIndex() + PREFIX_WIDTH); + // set buf id + int currentBufId = dataBuffers.size() - 1; + ArrowBuf currentBuf = dataBuffers.get(currentBufId); + if (currentBuf.capacity() - currentBuf.writerIndex() >= value.length) { + // current buffer is enough + // set buf index + valueBuffer.setInt(valueBuffer.writerIndex(), currentBufId); + valueBuffer.writerIndex(valueBuffer.writerIndex() + BUF_INDEX_WIDTH); + // set offset + valueBuffer.setInt(valueBuffer.writerIndex(), (int) currentBuf.writerIndex()); + valueBuffer.writerIndex(valueBuffer.writerIndex() + BUF_OFFSET_WIDTH); + currentBuf.setBytes(currentBuf.writerIndex(), value, 0, value.length); + currentBuf.writerIndex(currentBuf.writerIndex() + value.length); + dataBuffers.set(currentBufId, currentBuf); // is this needed? + } else { + // current buffer is not enough + // allocate new buffer + ArrowBuf newBuf = allocator.buffer(lastValueAllocationSizeInBytes); + // set buf index + valueBuffer.setInt(valueBuffer.writerIndex(), dataBuffers.size()); + valueBuffer.writerIndex(valueBuffer.writerIndex() + BUF_INDEX_WIDTH); + // set offset + valueBuffer.setInt(valueBuffer.writerIndex(), 0); + valueBuffer.writerIndex(valueBuffer.writerIndex() + BUF_OFFSET_WIDTH); + newBuf.setBytes(0, value, 0, value.length); + newBuf.writerIndex(newBuf.writerIndex() + value.length); + dataBuffers.add(newBuf); + } + } + + } + } + protected final void setBytes(int index, byte[] value, int start, int length) { /* end offset of current last element in the vector. this will * be the start offset of new element we are trying to store. @@ -1499,45 +1401,7 @@ protected final void setBytes(int index, byte[] value, int start, int length) { offsetBuffer.setInt((long) (index + 1) * OFFSET_WIDTH, startOffset + length); /* store the var length data in value buffer */ /*check whether the buffer is inline or reference buffer*/ - ViewBuffer viewBuffer; - if (initialBufferSize == 0) { - // initial allocation sizes are not set. Allocate the buffer based on the size of the value. - viewBuffer = ViewBufferFactory.createValueBuffer(allocator, value, startOffset, start, length, - dataBuffers); - views.add(viewBuffer); - } else { - if (initialBufferSize - value.length >= 0 && value.length <= INLINE_SIZE) { - viewBuffer = - ViewBufferFactory.createInlineValueBuffer( - allocator, - value, - start, - length, - initialBufferSize); - views.add(viewBuffer); - } - - if (initialBufferSize - value.length >= 0 && value.length > INLINE_SIZE) { - viewBuffer = - ViewBufferFactory.createReferenceValueBuffer( - allocator, - value, - start, - length, - startOffset, - dataBuffers, - initialBufferSize); - views.add(viewBuffer); - } - } - - // if (viewBuffer instanceof InlineValueBuffer) { - // lastValueAllocationSizeInBytes = ((InlineValueBuffer) viewBuffer).getValueBuffer().capacity(); - // } else { - // ReferenceValueBuffer referenceValueBuffer = (ReferenceValueBuffer) viewBuffer; - // lastValueAllocationSizeInBytes = dataBuffers.get(referenceValueBuffer.getBufId()).capacity(); - // } - // valueBuffer.setBytes(startOffset, value, start, length); + createValueBuffer(allocator, value, dataBuffers); } public final int getStartOffset(int index) { @@ -1570,14 +1434,9 @@ protected final void handleSafe(int index, int dataLength) { } final long startOffset = lastSet < 0 ? 0 : getStartOffset(lastSet + 1); final long targetCapacity = startOffset + dataLength; - // if (valueBuffer.capacity() < targetCapacity) { - // reallocDataBuffer(targetCapacity); - // } - - // safe allocation doesn't apply for inline buffers. Inline buffers are always allocated with the required capacity. - // But for reference buffers, we need to check if the buffer has enough capacity to hold the new data. - - + if (valueBuffer.capacity() < targetCapacity) { + reallocDataBuffer(targetCapacity); + } } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java index 0299ab271a104..e4b86d01f63b2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java @@ -108,8 +108,20 @@ public byte[] get(int index) { final int startOffset = getStartOffset(index); final int dataLength = getEndOffset(index) - startOffset; final byte[] result = new byte[dataLength]; - valueBuffer.getBytes(startOffset, result, 0, dataLength); - return result; + if (dataLength > INLINE_SIZE) { + // data is in the inline buffer + // get buffer index + final int bufferIndex = valueBuffer.getInt(((long) index * VIEW_BUFFER_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH); + // get data offset + final int dataOffset = valueBuffer.getInt(((long) index * VIEW_BUFFER_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH + + BUF_INDEX_WIDTH); + dataBuffers.get(bufferIndex).getBytes(dataOffset, result, 0, dataLength); + return result; + } else { + // data is in the value buffer + valueBuffer.getBytes((long) index * VIEW_BUFFER_SIZE + BUF_INDEX_WIDTH, result, 0, dataLength); + return result; + } } /** diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java index 1c55d3fafd75f..59e4f6380d13f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java @@ -19,14 +19,14 @@ import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.List; +import java.util.Random; + import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.BaseVariableWidthViewVector.InlineValueBuffer; -import org.apache.arrow.vector.BaseVariableWidthViewVector.ReferenceValueBuffer; -import org.apache.arrow.vector.BaseVariableWidthViewVector.ViewBuffer; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -40,6 +40,7 @@ public class TestVarCharViewVector { private static final byte[] STR3 = "01234567891234567".getBytes(StandardCharsets.UTF_8); private static final byte[] STR4 = "01234567".getBytes(StandardCharsets.UTF_8); private static final byte[] STR5 = "012345678912345678".getBytes(StandardCharsets.UTF_8); + private static final byte[] STR6 = "01234567891234567890".getBytes(StandardCharsets.UTF_8); private BufferAllocator allocator; @@ -82,154 +83,120 @@ public void testAllocationLimits() { @Test public void testInlineAllocation() { - try (final ViewVarCharVector largeVarCharVector = new ViewVarCharVector("myvector", allocator)) { - largeVarCharVector.allocateNew(32, 3); + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 3); final int valueCount = 3; - largeVarCharVector.set(0, STR0); - largeVarCharVector.set(1, STR1); - largeVarCharVector.set(2, STR4); - largeVarCharVector.setValueCount(valueCount); - - List views = largeVarCharVector.views; - List dataBuffers = largeVarCharVector.dataBuffers; + viewVarCharVector.set(0, STR0); + viewVarCharVector.set(1, STR1); + viewVarCharVector.set(2, STR4); + viewVarCharVector.setValueCount(valueCount); - assert views.size() == 3; - assert dataBuffers.isEmpty(); + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); - ViewBuffer view0 = views.get(0); - assert view0 instanceof InlineValueBuffer; - validateInlineValueBuffer(STR0, (InlineValueBuffer) view0); + assert view1 != null; + assert view2 != null; + assert view3 != null; - ViewBuffer view1 = views.get(1); - assert view1 instanceof InlineValueBuffer; - validateInlineValueBuffer(STR1, (InlineValueBuffer) view1); + System.out.println(new String(view1, StandardCharsets.UTF_8)); + System.out.println(new String(view2, StandardCharsets.UTF_8)); + System.out.println(new String(view3, StandardCharsets.UTF_8)); - ViewBuffer view2 = views.get(1); - assert view2 instanceof InlineValueBuffer; - validateInlineValueBuffer(STR1, (InlineValueBuffer) view2); + // TODO: assert length, offset, and values per each view + // refer to testSetLastSetUsage } } - /** - * Validate the InlineValueBuffer by comparing the expected byte array with the actual byte array stored in the buffer. - * @param expected byte array expected to be compared - * @param inlineValueBuffer InlineValueBuffer to be validated - */ - private void validateInlineValueBuffer(byte[] expected, InlineValueBuffer inlineValueBuffer) { - assert inlineValueBuffer.getLength() == expected.length; - byte[] viewBytes = new byte[expected.length]; - inlineValueBuffer.getValueBuffer().getBytes(0, viewBytes); - String expectedStr = new String(viewBytes, StandardCharsets.UTF_8); - String viewStr = new String(expected, StandardCharsets.UTF_8); - assert expectedStr.equals(viewStr); - } - - /** - * Validate the ReferenceValueBuffer by comparing the expected byte array with the actual byte array stored in the - * buffer. - * @param expected byte array expected to be compared - * @param referenceValueBuffer ReferenceValueBuffer to be validated - * @param dataBuffers List of ArrowBuf extracted from the ViewVarCharVector - * @param startOffSet starting index to read from the ArrowBuf, useful when there are multiple values stored in the - * same ArrowBuf - */ - private void validateReferenceValueBuffer(byte[] expected, ReferenceValueBuffer referenceValueBuffer, - List dataBuffers, int startOffSet) { - int bufId = referenceValueBuffer.getBufId(); - byte[] expectedPrefixBytes = new byte[4]; - System.arraycopy(expected, 0, expectedPrefixBytes, 0, 4); - String expectedPrefix = new String(expectedPrefixBytes, StandardCharsets.UTF_8); - String viewPrefix = new String(referenceValueBuffer.getPrefix(), StandardCharsets.UTF_8); - assert expectedPrefix.equals(viewPrefix); - ArrowBuf dataBuf = dataBuffers.get(bufId); - byte[] dataBufBytes = new byte[expected.length]; - dataBuf.getBytes(startOffSet, dataBufBytes); - String viewData = new String(dataBufBytes, StandardCharsets.UTF_8); - String viewDataExpected = new String(expected, StandardCharsets.UTF_8); - assert viewData.equals(viewDataExpected); - } - @Test public void testReferenceAllocationInSameBuffer() { - try (final ViewVarCharVector largeVarCharVector = new ViewVarCharVector("myvector", allocator)) { - largeVarCharVector.allocateNew(42, 3); - final int valueCount = 3; - largeVarCharVector.set(0, STR1); - largeVarCharVector.set(1, STR2); - largeVarCharVector.set(2, STR3); - largeVarCharVector.setValueCount(valueCount); - - List views = largeVarCharVector.views; - List dataBuffers = largeVarCharVector.dataBuffers; - - assert views.size() == 3; - assert dataBuffers.size() == 1; - - ViewBuffer view0 = views.get(0); - assert view0 instanceof InlineValueBuffer; - validateInlineValueBuffer(STR1, (InlineValueBuffer) view0); - - ViewBuffer view1 = views.get(1); - assert view1 instanceof ReferenceValueBuffer; - validateReferenceValueBuffer(STR2, (ReferenceValueBuffer) view1, dataBuffers, 0); - // third view - ViewBuffer view2 = views.get(2); - assert view2 instanceof ReferenceValueBuffer; - validateReferenceValueBuffer(STR3, (ReferenceValueBuffer) view2, dataBuffers, STR2.length); + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 4); + final int valueCount = 4; + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, generateRandomString(34).getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + + assert view1 != null; + assert view2 != null; + assert view3 != null; + assert view4 != null; + + System.out.println(new String(view1, StandardCharsets.UTF_8)); + System.out.println(new String(view2, StandardCharsets.UTF_8)); + System.out.println(new String(view3, StandardCharsets.UTF_8)); + System.out.println(new String(view4, StandardCharsets.UTF_8)); + + System.out.println(viewVarCharVector.dataBuffers.size()); // 1 } } @Test public void testReferenceAllocationInOtherBuffer() { - try (final ViewVarCharVector largeVarCharVector = new ViewVarCharVector("myvector", allocator)) { - largeVarCharVector.allocateNew(18, 4); + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 4); final int valueCount = 4; - largeVarCharVector.set(0, STR1); - largeVarCharVector.set(1, STR2); - largeVarCharVector.set(2, STR3); - largeVarCharVector.set(3, STR5); - largeVarCharVector.setValueCount(valueCount); + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, generateRandomString(35).getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + + assert view1 != null; + assert view2 != null; + assert view3 != null; + assert view4 != null; + + System.out.println(new String(view1, StandardCharsets.UTF_8)); + System.out.println(new String(view2, StandardCharsets.UTF_8)); + System.out.println(new String(view3, StandardCharsets.UTF_8)); + System.out.println(new String(view4, StandardCharsets.UTF_8)); + + System.out.println(viewVarCharVector.dataBuffers.size()); // 2 + } + } - List views = largeVarCharVector.views; - List dataBuffers = largeVarCharVector.dataBuffers; - - assert views.size() == 4; - assert dataBuffers.size() == 2; - - ViewBuffer view0 = views.get(0); - assert view0 instanceof InlineValueBuffer; - validateInlineValueBuffer(STR1, (InlineValueBuffer) view0); - - ViewBuffer view1 = views.get(1); - assert view1 instanceof ReferenceValueBuffer; - validateReferenceValueBuffer(STR2, (ReferenceValueBuffer) view1, dataBuffers, 0); - - ViewBuffer view2 = views.get(2); - assert view2 instanceof ReferenceValueBuffer; - validateReferenceValueBuffer(STR3, (ReferenceValueBuffer) view2, dataBuffers, STR2.length); - // third view - ViewBuffer view3 = views.get(3); - assert view3 instanceof ReferenceValueBuffer; - validateReferenceValueBuffer(STR4, (ReferenceValueBuffer) view2, dataBuffers, 0); - - // checking if the first buffer in `dataBuffers` contains all the data as expected - // view1 and view2 are in the same buffer since we choose 18 as the allocation size - // nearest divisible by 8 and power of 2 allocation size is 32. - // So, the first buffer should contain STR2 with number of bytes 13. - // the second buffer should contain STR3 with number of bytes 17. - // Total space for view1 and view2 is 30 bytes, and we only have 2 bytes left. - // We have to allocate another buffer for view3. So total number of views is 3. - // Total number of data buffers are 2 where the first buffer contains STR2 and STR3 - // and the second buffer contains STR5. - byte[] dataBufAllBytes = new byte[STR2.length + STR3.length]; - ArrowBuf dataBuf1 = dataBuffers.get(0); - dataBuf1.getBytes(0, dataBufAllBytes); - String viewDataAll = new String(dataBufAllBytes, StandardCharsets.UTF_8); - byte[] expectedAllBytes = new byte[STR2.length + STR3.length]; - System.arraycopy(STR2, 0, expectedAllBytes, 0, STR2.length); - System.arraycopy(STR3, 0, expectedAllBytes, STR2.length, STR3.length); - String expectedAll = new String(expectedAllBytes, StandardCharsets.UTF_8); - assert viewDataAll.equals(expectedAll); + @Test + public void testMixedAllocation() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(128, 6); + final int valueCount = 6; + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, generateRandomString(35).getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.set(4, STR1); + viewVarCharVector.set(5, generateRandomString(40).getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + + assert view1 != null; + assert view2 != null; + assert view3 != null; + assert view4 != null; + + System.out.println(new String(view1, StandardCharsets.UTF_8)); + System.out.println(new String(view2, StandardCharsets.UTF_8)); + System.out.println(new String(view3, StandardCharsets.UTF_8)); + System.out.println(new String(view4, StandardCharsets.UTF_8)); + + System.out.println(viewVarCharVector.dataBuffers.size()); // 2 } } @@ -266,6 +233,96 @@ public void testBasicV2() { } } + @Test + public void testBufferCreation() { + // inline buffer creation + final int VIEW_BUFFER_SIZE = 16; + final int LENGTH_WIDTH = 4; + final int INLINE_BUF_DATA_LENGTH = 12; + final int PREFIX_WIDTH = 4; + final int BUF_INDEX_WIDTH = 4; + final int BUF_OFFSET_WIDTH = 4; + int lastWriteIndexInLineBuffer = 0; + List dataBuffer = new ArrayList<>(); + ArrowBuf inlineBuf = allocator.buffer(BaseVariableWidthViewVector.INITIAL_VALUE_ALLOCATION * 8); + inlineBuf.setInt(0, STR0.length); + + inlineBuf.readerIndex(0); + inlineBuf.writerIndex(LENGTH_WIDTH); + + inlineBuf.setBytes(LENGTH_WIDTH, STR0, 0, STR0.length); + inlineBuf.writerIndex(VIEW_BUFFER_SIZE); + + + // reference buffer creation + // set buffer id + if (dataBuffer.isEmpty()) { + // add first buffer + ArrowBuf dataBuf = allocator.buffer(BaseVariableWidthViewVector.INITIAL_VALUE_ALLOCATION * 8); + // set length + inlineBuf.setInt(inlineBuf.writerIndex(), STR2.length); + inlineBuf.writerIndex(inlineBuf.writerIndex() + LENGTH_WIDTH); + // set prefix + inlineBuf.setBytes(inlineBuf.writerIndex(), STR2, 0, PREFIX_WIDTH); + inlineBuf.writerIndex(inlineBuf.writerIndex() + PREFIX_WIDTH); + // set buf id + inlineBuf.setInt(inlineBuf.writerIndex(), 0); + inlineBuf.writerIndex(inlineBuf.writerIndex() + BUF_INDEX_WIDTH); + // add offset + inlineBuf.setInt(inlineBuf.writerIndex(), 0); + inlineBuf.writerIndex(inlineBuf.writerIndex() + BUF_OFFSET_WIDTH); + dataBuf.setBytes(0, STR2, 0, STR2.length); + dataBuf.readerIndex(0); + dataBuf.writerIndex(STR2.length); + dataBuffer.add(dataBuf); + } else { + // continue from last buffer + } + + System.out.println("Reading Inline Buffer"); + System.out.println("Length: " + inlineBuf.getInt(0)); + byte[] bytes = new byte[STR0.length]; + inlineBuf.getBytes(LENGTH_WIDTH, bytes, 0, bytes.length); + String str = new String(bytes, StandardCharsets.UTF_8); + System.out.println("Value: " + str); + + System.out.println("Read Next Buffer"); + inlineBuf.readerIndex(inlineBuf.readerIndex() + VIEW_BUFFER_SIZE); + int refBufLength = inlineBuf.getInt(inlineBuf.readerIndex()); + System.out.println("Length: " + refBufLength); + if (refBufLength > 12) { + // must be a reference buffer + // read prefix + byte[] prefixBytes = new byte[PREFIX_WIDTH]; + inlineBuf.readerIndex(inlineBuf.readerIndex() + PREFIX_WIDTH); + inlineBuf.getBytes(inlineBuf.readerIndex(), prefixBytes, 0, PREFIX_WIDTH); + String prefix = new String(prefixBytes, StandardCharsets.UTF_8); + System.out.println("PREFIX : " + prefix); + // read buf Id + inlineBuf.readerIndex(inlineBuf.readerIndex() + BUF_INDEX_WIDTH); + int bufId = inlineBuf.getInt(inlineBuf.readerIndex()); + System.out.println("Buf Id: " + bufId); + // read offset + inlineBuf.readerIndex(inlineBuf.readerIndex() + BUF_OFFSET_WIDTH); + int offset = inlineBuf.getInt(inlineBuf.readerIndex()); + System.out.println("Offset: " + offset); + ArrowBuf refBuf = dataBuffer.get(bufId); + byte[] refBytes = new byte[refBufLength]; + refBuf.getBytes(offset, refBytes, 0, refBytes.length); + String refStr = new String(refBytes, StandardCharsets.UTF_8); + System.out.println("Value: " + refStr); + } else { + // inline buffer + } + + inlineBuf.clear(); + inlineBuf.close(); + ArrowBuf referenceBuf = dataBuffer.get(0); + referenceBuf.clear(); + referenceBuf.close(); + allocator.close(); + } + public static void setBytes(int index, byte[] bytes, LargeVarCharVector vector) { final long currentOffset = vector.offsetBuffer.getLong((long) index * BaseLargeVariableWidthVector.OFFSET_WIDTH); @@ -276,4 +333,13 @@ public static void setBytes(int index, byte[] bytes, LargeVarCharVector vector) currentOffset + bytes.length); vector.valueBuffer.setBytes(currentOffset, bytes, 0, bytes.length); } + + private String generateRandomString(int length) { + Random random = new Random(); + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append(random.nextInt(10)); // 0-9 + } + return sb.toString(); + } }