Skip to content

Commit

Permalink
Merge pull request #2199 from revans2/string-append
Browse files Browse the repository at this point in the history
[REVIEW] Updated API to better support appending strings when building a column vector
  • Loading branch information
revans2 authored Jul 16, 2019
2 parents a110fdf + fd96617 commit 37f098e
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 75 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
- PR #2168 Use cudf.Column for CategoricalColumn's categories instead of a tuple
- PR #2193 Added more docuemtnation to `type_dispatcher` for specializing dispatched functors
- PR #2197 CSV Writer: Expose `chunksize` as a parameter for `to_csv`
- PR #2199 Better java support for appending strings
- PR #2176 Added column dtype support for datetime, int8, int16 to csv_writer
- PR #2209 Matching `get_dummies` & `select_dtypes` behavior to pandas
- PR #2217 Updated Java bindings to use the new groupby API
Expand Down
194 changes: 123 additions & 71 deletions java/src/main/java/ai/rapids/cudf/ColumnVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@
* to increment the reference count.
*/
public final class ColumnVector implements AutoCloseable, BinaryOperable {
/**
* The size in bytes of an offset entry
*/
static final int OFFSET_SIZE = DType.INT32.sizeInBytes;
private static final Logger log = LoggerFactory.getLogger(ColumnVector.class);

static {
Expand Down Expand Up @@ -674,8 +678,8 @@ public final boolean getBoolean(long index) {
public String getJavaString(long index) {
assert type == DType.STRING || type == DType.STRING_CATEGORY;
assertsForGet(index);
int start = offHeap.hostData.offsets.getInt(index * 4); // size of an int
int size = offHeap.hostData.offsets.getInt((index + 1) * 4) - start;
int start = offHeap.hostData.offsets.getInt(index * OFFSET_SIZE);
int size = offHeap.hostData.offsets.getInt((index + 1) * OFFSET_SIZE) - start;
byte[] rawData = new byte[size];
offHeap.hostData.data.getBytes(rawData, 0, start, size);
return new String(rawData, StandardCharsets.UTF_8);
Expand Down Expand Up @@ -1462,7 +1466,7 @@ public boolean clean(boolean logErrorIfNotClean) {
* @return the builder to use.
*/
public static Builder builder(DType type, int rows) {
return new Builder(type, TimeUnit.NONE, rows);
return new Builder(type, TimeUnit.NONE, rows, 0);
}

/**
Expand All @@ -1474,7 +1478,21 @@ public static Builder builder(DType type, int rows) {
* @return the builder to use.
*/
public static Builder builder(DType type, TimeUnit tsTimeUnit, int rows) {
return new Builder(type, tsTimeUnit, rows);
return new Builder(type, tsTimeUnit, rows, 0);
}

/**
* Create a new Builder to hold the specified number of rows and with enough space to hold the
* given amount of string data. Be sure to close the builder when done with it. Please try to
* use {@see #build(int, int, Consumer)} instead to avoid needing to close the builder.
* @param type the type of vector to build.
* @param rows the number of rows this builder can hold
* @param stringBufferSize the size of the string buffer to allocate.
* @return the builder to use.
*/
private static Builder builder(DType type, int rows, long stringBufferSize) {
assert type == DType.STRING_CATEGORY || type == DType.STRING;
return new Builder(type, TimeUnit.NONE, rows, stringBufferSize);
}

/**
Expand Down Expand Up @@ -1504,6 +1522,13 @@ public static ColumnVector build(DType type, TimeUnit tsTimeUnit, int rows,
}
}

private static ColumnVector build(DType type, int rows, long stringBufferSize, Consumer<Builder> init) {
try (Builder builder = builder(type, rows, stringBufferSize)) {
init.accept(builder);
return builder.build();
}
}

/**
* Create a new vector without sending data to the device.
* @param type the type of vector to build.
Expand Down Expand Up @@ -1609,67 +1634,26 @@ public static ColumnVector timestampsFromLongs(TimeUnit tsTimeUnit, long... valu
}

private static ColumnVector fromStrings(DType type, String... values) {
HostMemoryBuffer data = null;
HostMemoryBuffer offsets = null;
HostMemoryBuffer valid = null;
ColumnVector ret = null;
boolean needsCleanup = true;
try {
int rows = values.length;
long nullCount = 0;
// How many bytes do we need to hold the data. Sorry this is really expensive
long bufferSize = 0;
for (String s : values) {
if (s == null) {
nullCount++;
} else {
bufferSize += s.getBytes(StandardCharsets.UTF_8).length;
}
}
data = HostMemoryBuffer.allocate(bufferSize);
if (nullCount > 0) {
// copy and pasted from allocateBitmaskAndSetDefaultValues
long bitmaskSize = BitVectorHelper.getValidityAllocationSizeInBytes(rows);
valid = HostMemoryBuffer.allocate(bitmaskSize);
valid.setMemory(0, bitmaskSize, (byte) 0xFF);
}

offsets = HostMemoryBuffer.allocate((rows + 1) * 4);
int offset = 0;
// The initial offset is always 0
offsets.setInt(0, offset);
for (int i = 0; i < values.length; i++) {
String s = values[i];
if (s == null) {
BitVectorHelper.setNullAt(valid, i);
} else {
byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
data.setBytes(offset, utf8, 0, utf8.length);
offset += utf8.length;
}
offsets.setInt((i + 1L) * 4, offset);
}
ret = new ColumnVector(type, TimeUnit.NONE, rows, nullCount, data, valid, offsets);
ret.ensureOnDevice();
needsCleanup = false;
return ret;
} finally {
if (needsCleanup) {
if (ret != null) {
ret.close();
} else {
if (data != null) {
data.close();
}
if (offsets != null) {
offsets.close();
}
if (valid != null) {
valid.close();
}
}
assert type == DType.STRING || type == DType.STRING_CATEGORY;
int rows = values.length;
long nullCount = 0;
// How many bytes do we need to hold the data. Sorry this is really expensive
long bufferSize = 0;
for (String s: values) {
if (s == null) {
nullCount++;
} else {
bufferSize += s.getBytes(StandardCharsets.UTF_8).length;
}
}
if (nullCount > 0) {
return build(type, rows, bufferSize, (b) -> b.appendBoxed(values));
}
return build(type, rows, bufferSize, (b) -> {
for (String s: values) {
b.append(s);
}
});
}

/**
Expand Down Expand Up @@ -1899,21 +1883,35 @@ public static final class Builder implements AutoCloseable {
private final TimeUnit tsTimeUnit;
private HostMemoryBuffer data;
private HostMemoryBuffer valid;
private HostMemoryBuffer offsets;
private long currentIndex = 0;
private long nullCount;
private long stringBufferSize = 0;
private int currentStringByteIndex = 0;
private boolean built;

/**
* Create a builder with a buffer of size rows
* @param type datatype
* @param tsTimeUnit for TIMESTAMP the unit of time it is storing.
* @param rows number of rows to allocate.
* @param stringBufferSize the size of the string data buffer if we are
* working with Strings. It is ignored otherwise.
*/
Builder(DType type, TimeUnit tsTimeUnit, long rows) {
Builder(DType type, TimeUnit tsTimeUnit, long rows, long stringBufferSize) {
this.type = type;
this.tsTimeUnit = tsTimeUnit;
this.rows = rows;
this.data = HostMemoryBuffer.allocate(rows * type.sizeInBytes);
if (type == DType.STRING || type == DType.STRING_CATEGORY) {
this.data = HostMemoryBuffer.allocate(stringBufferSize);
// The offsets are ints and there is 1 more than the number of rows.
this.offsets = HostMemoryBuffer.allocate((rows + 1) * OFFSET_SIZE);
// The first offset is always 0
this.offsets.setInt(0, 0);
this.stringBufferSize = stringBufferSize;
} else {
this.data = HostMemoryBuffer.allocate(rows * type.sizeInBytes);
}
}

/**
Expand All @@ -1924,9 +1922,10 @@ public static final class Builder implements AutoCloseable {
* @param testData a buffer to hold the data (should be large enough to hold rows entries).
* @param testValid a buffer to hold the validity vector (should be large enough to hold
* rows entries or is null).
* @param testOffsets a buffer to hold the offsets for strings and string categories.
*/
Builder(DType type, TimeUnit tsTimeUnit, long rows, HostMemoryBuffer testData,
HostMemoryBuffer testValid) {
HostMemoryBuffer testValid, HostMemoryBuffer testOffsets) {
this.type = type;
this.tsTimeUnit = tsTimeUnit;
this.rows = rows;
Expand Down Expand Up @@ -1998,6 +1997,30 @@ public final Builder append(double value) {
return this;
}

public final Builder append(String value) {
assert value != null : "appendNull must be used to append null strings";
return appendUTF8String(value.getBytes(StandardCharsets.UTF_8));
}

private Builder appendUTF8String(byte[] value) {
return appendUTF8String(value, 0, value.length);
}

private Builder appendUTF8String(byte[] value, int offset, int length) {
assert value != null : "appendNull must be used to append null strings";
assert offset >= 0;
assert length >= 0;
assert value.length + offset <= length;
assert type == DType.STRING_CATEGORY || type == DType.STRING;
assert currentIndex < rows;
assert (currentStringByteIndex + length) <= stringBufferSize;
data.setBytes(currentStringByteIndex, value, offset, length);
currentStringByteIndex += length;
currentIndex++;
offsets.setInt(currentIndex * OFFSET_SIZE, currentStringByteIndex);
return this;
}

public final Builder appendArray(byte... values) {
assert (values.length + currentIndex) <= rows;
assert type == DType.INT8 || type == DType.BOOL8;
Expand Down Expand Up @@ -2165,6 +2188,23 @@ public final Builder appendBoxed(Double... values) throws IndexOutOfBoundsExcept
return this;
}

/**
* Append multiple values. This is very slow and should really only be used for tests.
* @param values the values to append, including nulls.
* @return this for chaining.
* @throws {@link IndexOutOfBoundsException}
*/
public final Builder appendBoxed(String... values) throws IndexOutOfBoundsException {
for (String b : values) {
if (b == null) {
appendNull();
} else {
append(b);
}
}
return this;
}

/**
* Append this vector to the end of this vector
* @param columnVector - Vector to be added
Expand All @@ -2175,9 +2215,14 @@ public final Builder append(ColumnVector columnVector) {
assert columnVector.type == type;
assert columnVector.offHeap.hostData != null;

data.copyFromHostBuffer(currentIndex * type.sizeInBytes, columnVector.offHeap.hostData.data,
0L,
columnVector.getRowCount() * type.sizeInBytes);
if (type == DType.STRING_CATEGORY || type == DType.STRING) {
throw new UnsupportedOperationException(
"Appending a string column vector client side is not currently supported");
} else {
data.copyFromHostBuffer(currentIndex * type.sizeInBytes, columnVector.offHeap.hostData.data,
0L,
columnVector.getRowCount() * type.sizeInBytes);
}

if (columnVector.nullCount != 0) {
if (valid == null) {
Expand All @@ -2204,6 +2249,9 @@ private void allocateBitmaskAndSetDefaultValues() {
public final Builder appendNull() {
setNullAt(currentIndex);
currentIndex++;
if (type == DType.STRING || type == DType.STRING_CATEGORY) {
offsets.setInt(currentIndex * OFFSET_SIZE, currentStringByteIndex);
}
return this;
}

Expand All @@ -2230,7 +2278,7 @@ public final ColumnVector build() {
throw new IllegalStateException("Cannot reuse a builder.");
}
ColumnVector cv = new ColumnVector(type, tsTimeUnit,
currentIndex, nullCount, data, valid);
currentIndex, nullCount, data, valid, offsets);
try {
cv.ensureOnDevice();
built = true;
Expand All @@ -2250,7 +2298,7 @@ public final ColumnVector buildOnHost() {
throw new IllegalStateException("Cannot reuse a builder.");
}
ColumnVector cv = new ColumnVector(type, tsTimeUnit,
currentIndex, nullCount, data, valid);
currentIndex, nullCount, data, valid, offsets);
built = true;
return cv;
}
Expand All @@ -2268,6 +2316,10 @@ public final void close() {
valid.close();
valid = null;
}
if (offsets != null) {
offsets.close();
offsets = null;
}
built = true;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,4 +145,4 @@ void testAppendVector() {
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -160,4 +160,4 @@ void testAppendVector() {
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -144,4 +144,4 @@ void testAppendVector() {
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,4 @@ void testAppendVector() {
}
}
}
}
}

0 comments on commit 37f098e

Please sign in to comment.