Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Initial work for decimal type in Java/JNI [skip ci] #6514

Merged
merged 32 commits into from
Oct 27, 2020
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4a4ea1d
Initial changes for supporting decimal type
nartal1 Oct 8, 2020
a549452
Modify constructors and JNI methods to create column.
nartal1 Oct 12, 2020
31f131b
Add inner class NativeDataType
nartal1 Oct 13, 2020
4e5f4e1
Merge branch 'branch-0.17' of https://github.com/rapidsai/cudf into d…
nartal1 Oct 13, 2020
23ef61d
Pass in scale and ordinal instead of saving native data_type handle
nartal1 Oct 14, 2020
17e1d30
Add DataType in HostColumnVector and address review comments
nartal1 Oct 14, 2020
b29c59b
Support DataType in Builder
nartal1 Oct 14, 2020
a7ae605
Merge branch 'branch-0.17' of https://github.com/rapidsai/cudf into d…
nartal1 Oct 15, 2020
9a2e948
addressed few review comments
nartal1 Oct 15, 2020
da9e4d6
addressed review comments
nartal1 Oct 15, 2020
b3d72d1
update PR with new design and addressed review comments
nartal1 Oct 21, 2020
7dba559
addressed few nits
nartal1 Oct 22, 2020
b0108ef
add factory methods for DType
sperlingxx Oct 22, 2020
4278d48
addressed review comments
nartal1 Oct 22, 2020
86b8a6b
Merge pull request #1 from sperlingxx/dtype_class_alfxu
nartal1 Oct 22, 2020
1434312
addressed review comments
nartal1 Oct 22, 2020
1931e38
Merge branch 'dtype_class' of github.com:nartal1/cudf into dtype_class
nartal1 Oct 22, 2020
f59c524
addressed review comments
nartal1 Oct 22, 2020
5220a7d
add decimal support for Scalar
sperlingxx Oct 23, 2020
e4af748
Merge pull request #2 from sperlingxx/dtype_class_alfxu
nartal1 Oct 23, 2020
1d21b47
addressed review comments
nartal1 Oct 24, 2020
3635007
update changelog
nartal1 Oct 24, 2020
9680e44
Merge branch 'branch-0.17' of https://github.com/rapidsai/cudf into d…
nartal1 Oct 24, 2020
fa8af4b
update changelog
nartal1 Oct 24, 2020
5c52d88
updated toString method:
nartal1 Oct 26, 2020
6ad640b
addressed review comments
nartal1 Oct 26, 2020
9b5b4b1
Merge branch 'branch-0.17' of https://github.com/rapidsai/cudf into d…
nartal1 Oct 26, 2020
767b50c
update changelog
nartal1 Oct 26, 2020
6d2bac9
update comments
nartal1 Oct 26, 2020
74c311c
update comments
nartal1 Oct 26, 2020
ef9d557
Merge branch 'branch-0.17' of https://github.com/rapidsai/cudf into d…
nartal1 Oct 27, 2020
f5c6d56
update changelog
nartal1 Oct 27, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 131 additions & 71 deletions java/src/main/java/ai/rapids/cudf/ColumnVector.java

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions java/src/main/java/ai/rapids/cudf/DType.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ public enum DType {

STRING(0, 23, "str"),
LIST(0, 24, "list"),
DECIMAL32(4, 25, "decimal32"),
DECIMAL64(8, 26, "decimal64"),
STRUCT(0, 27, "struct");

private static final DType[] D_TYPES = DType.values();
Expand Down Expand Up @@ -119,6 +121,23 @@ public boolean isBackedByLong() {
return LONGS.contains(this);
}

/**
 * Reports whether this type is a member of the SHORTS set.
 * The members of that set are:
 *   DType.INT16,
 *   DType.UINT16
 * @return true when this type is stored as a short, false otherwise
 */
public boolean isBackedByShort() {
  return SHORTS.contains(this);
}

/**
 * Reports whether this type is a member of the BYTES set.
 * The members of that set are:
 *   DType.INT8,
 *   DType.UINT8,
 *   DType.BOOL8
 * @return true when this type is stored as a byte, false otherwise
 */
public boolean isBackedByByte() {
  return BYTES.contains(this);
}

/**
* Returns true for duration types
*/
Expand Down Expand Up @@ -196,6 +215,17 @@ public static DType fromNative(int nativeId) {
DType.TIMESTAMP_DAYS
);

// Types for which isBackedByShort() returns true.
private static final EnumSet<DType> SHORTS = EnumSet.of(DType.INT16, DType.UINT16);

private static final EnumSet<DType> BYTES = EnumSet.of(
DType.INT8,
jlowe marked this conversation as resolved.
Show resolved Hide resolved
DType.UINT8,
DType.BOOL8
);

private static final EnumSet<DType> NESTED_TYPE = EnumSet.of(
DType.LIST,
DType.STRUCT
Expand Down
46 changes: 46 additions & 0 deletions java/src/main/java/ai/rapids/cudf/DataType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.rapids.cudf;

public class DataType {
jlowe marked this conversation as resolved.
Show resolved Hide resolved

final DType typeId;
int scale = 0;
jlowe marked this conversation as resolved.
Show resolved Hide resolved

public DataType(DType id) { typeId = id; }
jlowe marked this conversation as resolved.
Show resolved Hide resolved

public DataType(DType id, int decimalScale) {
typeId = id;
scale = decimalScale;
}

public boolean isTimestamp() { return typeId.isTimestamp();}

public boolean hasTimeResolution() { return typeId.hasTimeResolution(); }

public boolean isBackedByInt() { return typeId.isBackedByInt(); }

public boolean isBackedByLong() { return typeId.isBackedByLong(); }

public boolean isBackedByShort() { return typeId.isBackedByShort(); }

public boolean isBackedByByte() { return typeId.isBackedByByte(); }

public boolean isNestedType() { return typeId.isNestedType(); }

public int getNativeId() { return typeId.getNativeId(); }

}
112 changes: 77 additions & 35 deletions java/src/main/java/ai/rapids/cudf/HostColumnVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ public final class HostColumnVector extends HostColumnVectorCore {
this(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, null);
}

/**
 * Convenience constructor for columns without an offsets buffer: forwards to the
 * six-argument constructor, passing null for the offsets buffer.
 */
HostColumnVector(
    ai.rapids.cudf.DataType type,
    long rows,
    Optional<Long> nullCount,
    HostMemoryBuffer hostDataBuffer,
    HostMemoryBuffer hostValidityBuffer) {
  this(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, null);
}

/**
* Create a new column vector with data populated on the host.
* @param type the type of the vector
Expand Down Expand Up @@ -89,6 +94,21 @@ public final class HostColumnVector extends HostColumnVectorCore {
incRefCountInternal(true);
}

HostColumnVector(ai.rapids.cudf.DataType type, long rows, Optional<Long> nullCount,
HostMemoryBuffer hostDataBuffer, HostMemoryBuffer hostValidityBuffer,
HostMemoryBuffer offsetBuffer) {
super(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, offsetBuffer, new ArrayList<>());
assert type.typeId != DType.LIST : "This constructor should not be used for list type";
if (nullCount.isPresent() && nullCount.get() > 0 && hostValidityBuffer == null) {
throw new IllegalStateException("Buffer cannot have a nullCount without a validity buffer");
}
if (type.typeId != DType.STRING && type.typeId != DType.LIST) {
assert offsetBuffer == null : "offsets are only supported for STRING and LIST";
}
refCount = 0;
incRefCountInternal(true);
}

/**
* This is a really ugly API, but it is possible that the lifecycle of a column of
* data may not have a clear lifecycle thanks to java and GC. This API informs the leak
Expand Down Expand Up @@ -175,8 +195,8 @@ public ColumnVector copyToDevice() {
if (!type.isNestedType()) {
jlowe marked this conversation as resolved.
Show resolved Hide resolved
HostMemoryBuffer hdata = this.offHeap.data;
if (hdata != null) {
long dataLen = rows * type.sizeInBytes;
if (type == DType.STRING) {
long dataLen = rows * type.typeId.sizeInBytes;
jlowe marked this conversation as resolved.
Show resolved Hide resolved
if (type.typeId == DType.STRING) {
// This needs a different type
dataLen = getEndStringOffset(rows - 1);
if (dataLen == 0 && getNullCount() == 0) {
Expand Down Expand Up @@ -239,6 +259,10 @@ public static Builder builder(DType type, int rows) {
return new Builder(type, rows, 0);
}

/**
 * Creates a new Builder for the given data type and row count.
 * The string buffer size handed to the Builder constructor is 0.
 * @param type the data type of the column to build
 * @param rows the number of rows the builder is created for
 * @return the new builder
 */
public static Builder builder(ai.rapids.cudf.DataType type, int rows) {
  final long stringBufferSize = 0;
  return new Builder(type, rows, stringBufferSize);
}

/**
* Create a new Builder to hold the specified number of rows and with enough space to hold the
* given amount of string data. Be sure to close the builder when done with it. Please try to
Expand Down Expand Up @@ -1074,7 +1098,7 @@ public String toString() {

public static final class Builder implements AutoCloseable {
private final long rows;
private final DType type;
private final ai.rapids.cudf.DataType type;
private HostMemoryBuffer data;
private HostMemoryBuffer valid;
private HostMemoryBuffer offsets;
Expand All @@ -1091,7 +1115,7 @@ public static final class Builder implements AutoCloseable {
* working with Strings. It is ignored otherwise.
*/
Builder(DType type, long rows, long stringBufferSize) {
this.type = type;
this.type = new ai.rapids.cudf.DataType(type);
this.rows = rows;
if (type == DType.STRING) {
if (stringBufferSize <= 0) {
Expand All @@ -1108,6 +1132,24 @@ public static final class Builder implements AutoCloseable {
}
}

Builder(ai.rapids.cudf.DataType type, long rows, long stringBufferSize) {
this.type = type;
this.rows = rows;
if (type.typeId == DType.STRING) {
if (stringBufferSize <= 0) {
// We need at least one byte or we will get NULL back for data
stringBufferSize = 1;
}
this.data = HostMemoryBuffer.allocate(stringBufferSize);
// The offsets are ints and there is 1 more than the number of rows.
this.offsets = HostMemoryBuffer.allocate((rows + 1) * OFFSET_SIZE);
// The first offset is always 0
this.offsets.setInt(0, 0);
} else {
this.data = HostMemoryBuffer.allocate(rows * type.typeId.sizeInBytes);
}
}

/**
* Create a builder with a buffer of size rows (for testing ONLY).
* @param type datatype
Expand All @@ -1119,72 +1161,72 @@ public static final class Builder implements AutoCloseable {
*/
Builder(DType type, long rows, HostMemoryBuffer testData,
HostMemoryBuffer testValid, HostMemoryBuffer testOffsets) {
this.type = type;
this.type = new ai.rapids.cudf.DataType(type);
this.rows = rows;
// Uses the caller-supplied buffers directly instead of allocating new ones.
this.data = testData;
this.valid = testValid;
// NOTE(review): testOffsets is accepted but never stored in this constructor, so the
// builder's offsets field is left unset here — confirm whether that is intentional.
}

/**
 * Appends one boolean at the current index and advances the index by one.
 * The value is written as a single byte: 1 for true, 0 for false.
 * Assertions check that the builder type is BOOL8 and that the index is below rows.
 * @param value the boolean to append
 * @return this builder
 */
public final Builder append(boolean value) {
assert type == DType.BOOL8;
assert type.typeId == DType.BOOL8;
assert currentIndex < rows;
data.setByte(currentIndex * type.sizeInBytes, value ? (byte)1 : (byte)0);
data.setByte(currentIndex * type.typeId.sizeInBytes, value ? (byte)1 : (byte)0);
currentIndex++;
return this;
}

/**
 * Appends one byte at the current index and advances the index by one.
 * Assertions check that the builder type is INT8, UINT8 or BOOL8 and that the index
 * is below rows.
 * @param value the byte to append
 * @return this builder
 */
public final Builder append(byte value) {
assert type == DType.INT8 || type == DType.UINT8 || type == DType.BOOL8;
assert type.typeId == DType.INT8 || type.typeId == DType.UINT8 || type.typeId == DType.BOOL8;
assert currentIndex < rows;
data.setByte(currentIndex * type.sizeInBytes, value);
data.setByte(currentIndex * type.typeId.sizeInBytes, value);
currentIndex++;
return this;
}

/**
 * Appends a byte value for count rows and advances the index by count.
 * The write is a single data.setMemory call that is passed the current offset,
 * count and value.
 * Assertions check that count more rows fit and that the builder type is INT8,
 * UINT8 or BOOL8.
 * @param value the byte to write
 * @param count how many rows to advance by
 * @return this builder
 */
public final Builder append(byte value, long count) {
assert (count + currentIndex) <= rows;
assert type == DType.INT8 || type == DType.UINT8 || type == DType.BOOL8;
data.setMemory(currentIndex * type.sizeInBytes, count, value);
assert type.typeId == DType.INT8 || type.typeId == DType.UINT8 || type.typeId == DType.BOOL8;
data.setMemory(currentIndex * type.typeId.sizeInBytes, count, value);
currentIndex += count;
return this;
}

/**
 * Appends one short at the current index and advances the index by one.
 * Assertions check that the builder type is INT16 or UINT16 and that the index
 * is below rows.
 * @param value the short to append
 * @return this builder
 */
public final Builder append(short value) {
assert type == DType.INT16 || type == DType.UINT16;
assert type.typeId == DType.INT16 || type.typeId == DType.UINT16;
assert currentIndex < rows;
data.setShort(currentIndex * type.sizeInBytes, value);
data.setShort(currentIndex * type.typeId.sizeInBytes, value);
currentIndex++;
return this;
}

/**
 * Appends one int at the current index and advances the index by one.
 * Assertions check that type.isBackedByInt() holds and that the index is below rows.
 * @param value the int to append
 * @return this builder
 */
public final Builder append(int value) {
assert type.isBackedByInt();
assert currentIndex < rows;
data.setInt(currentIndex * type.sizeInBytes, value);
data.setInt(currentIndex * type.typeId.sizeInBytes, value);
currentIndex++;
return this;
}

/**
 * Appends one long at the current index and advances the index by one.
 * Assertions check that type.isBackedByLong() holds and that the index is below rows.
 * @param value the long to append
 * @return this builder
 */
public final Builder append(long value) {
assert type.isBackedByLong();
assert currentIndex < rows;
data.setLong(currentIndex * type.sizeInBytes, value);
data.setLong(currentIndex * type.typeId.sizeInBytes, value);
currentIndex++;
return this;
}

/**
 * Appends one float at the current index and advances the index by one.
 * Assertions check that the builder type is FLOAT32 and that the index is below rows.
 * @param value the float to append
 * @return this builder
 */
public final Builder append(float value) {
assert type == DType.FLOAT32;
assert type.typeId == DType.FLOAT32;
assert currentIndex < rows;
data.setFloat(currentIndex * type.sizeInBytes, value);
data.setFloat(currentIndex * type.typeId.sizeInBytes, value);
currentIndex++;
return this;
}

/**
 * Appends one double at the current index and advances the index by one.
 * Assertions check that the builder type is FLOAT64 and that the index is below rows.
 * @param value the double to append
 * @return this builder
 */
public final Builder append(double value) {
assert type == DType.FLOAT64;
assert type.typeId == DType.FLOAT64;
assert currentIndex < rows;
data.setDouble(currentIndex * type.sizeInBytes, value);
data.setDouble(currentIndex * type.typeId.sizeInBytes, value);
currentIndex++;
return this;
}
Expand All @@ -1203,7 +1245,7 @@ public Builder appendUTF8String(byte[] value, int offset, int length) {
assert offset >= 0;
assert length >= 0;
assert value.length + offset <= length;
assert type == DType.STRING;
assert type.typeId == DType.STRING;
assert currentIndex < rows;
// just for strings we want to throw a real exception if we would overrun the buffer
long oldLen = data.getLength();
Expand Down Expand Up @@ -1239,48 +1281,48 @@ public Builder appendUTF8String(byte[] value, int offset, int length) {

/**
 * Appends every element of the given byte array starting at the current index and
 * advances the index by values.length.
 * Assertions check that the values fit in the remaining rows and that the builder
 * type is byte-backed (INT8, UINT8 or BOOL8).
 * @param values the bytes to append
 * @return this builder
 */
public Builder appendArray(byte... values) {
assert (values.length + currentIndex) <= rows;
assert type == DType.INT8 || type == DType.UINT8 || type == DType.BOOL8;
data.setBytes(currentIndex * type.sizeInBytes, values, 0, values.length);
assert type.isBackedByByte();
data.setBytes(currentIndex * type.typeId.sizeInBytes, values, 0, values.length);
currentIndex += values.length;
return this;
}

/**
 * Appends every element of the given short array starting at the current index and
 * advances the index by values.length.
 * Assertions check that the builder type is short-backed (INT16 or UINT16) and that
 * the values fit in the remaining rows.
 * @param values the shorts to append
 * @return this builder
 */
public Builder appendArray(short... values) {
assert type == DType.INT16 || type == DType.UINT16;
assert type.isBackedByShort();
assert (values.length + currentIndex) <= rows;
data.setShorts(currentIndex * type.sizeInBytes, values, 0, values.length);
data.setShorts(currentIndex * type.typeId.sizeInBytes, values, 0, values.length);
currentIndex += values.length;
return this;
}

/**
 * Appends every element of the given int array starting at the current index and
 * advances the index by values.length.
 * Assertions check that type.isBackedByInt() holds and that the values fit in the
 * remaining rows.
 * @param values the ints to append
 * @return this builder
 */
public Builder appendArray(int... values) {
assert type.isBackedByInt();
assert (values.length + currentIndex) <= rows;
data.setInts(currentIndex * type.sizeInBytes, values, 0, values.length);
data.setInts(currentIndex * type.typeId.sizeInBytes, values, 0, values.length);
currentIndex += values.length;
return this;
}

/**
 * Appends every element of the given long array starting at the current index and
 * advances the index by values.length.
 * Assertions check that type.isBackedByLong() holds and that the values fit in the
 * remaining rows.
 * @param values the longs to append
 * @return this builder
 */
public Builder appendArray(long... values) {
assert type.isBackedByLong();
assert (values.length + currentIndex) <= rows;
data.setLongs(currentIndex * type.sizeInBytes, values, 0, values.length);
data.setLongs(currentIndex * type.typeId.sizeInBytes, values, 0, values.length);
currentIndex += values.length;
return this;
}

/**
 * Appends every element of the given float array starting at the current index and
 * advances the index by values.length.
 * Assertions check that the builder type is FLOAT32 and that the values fit in the
 * remaining rows.
 * @param values the floats to append
 * @return this builder
 */
public Builder appendArray(float... values) {
assert type == DType.FLOAT32;
assert type.typeId == DType.FLOAT32;
assert (values.length + currentIndex) <= rows;
data.setFloats(currentIndex * type.sizeInBytes, values, 0, values.length);
data.setFloats(currentIndex * type.typeId.sizeInBytes, values, 0, values.length);
currentIndex += values.length;
return this;
}

/**
 * Appends every element of the given double array starting at the current index and
 * advances the index by values.length.
 * Assertions check that the builder type is FLOAT64 and that the values fit in the
 * remaining rows.
 * @param values the doubles to append
 * @return this builder
 */
public Builder appendArray(double... values) {
assert type == DType.FLOAT64;
assert type.typeId == DType.FLOAT64;
assert (values.length + currentIndex) <= rows;
data.setDoubles(currentIndex * type.sizeInBytes, values, 0, values.length);
data.setDoubles(currentIndex * type.typeId.sizeInBytes, values, 0, values.length);
currentIndex += values.length;
return this;
}
Expand Down Expand Up @@ -1429,15 +1471,15 @@ public final Builder appendBoxed(String... values) throws IndexOutOfBoundsExcept
*/
public final Builder append(HostColumnVector columnVector) {
assert columnVector.rows <= (rows - currentIndex);
assert columnVector.type == type;
assert columnVector.type.typeId == type.typeId;
jlowe marked this conversation as resolved.
Show resolved Hide resolved

if (type == DType.STRING) {
if (type.typeId == DType.STRING) {
throw new UnsupportedOperationException(
"Appending a string column vector client side is not currently supported");
} else {
data.copyFromHostBuffer(currentIndex * type.sizeInBytes, columnVector.offHeap.data,
data.copyFromHostBuffer(currentIndex * type.typeId.sizeInBytes, columnVector.offHeap.data,
0L,
columnVector.getRowCount() * type.sizeInBytes);
columnVector.getRowCount() * type.typeId.sizeInBytes);
}

//As this is doing the append on the host assume that a null count is available
Expand Down Expand Up @@ -1467,7 +1509,7 @@ private void allocateBitmaskAndSetDefaultValues() {
public final Builder appendNull() {
setNullAt(currentIndex);
currentIndex++;
if (type == DType.STRING) {
if (type.typeId == DType.STRING) {
offsets.setInt(currentIndex * OFFSET_SIZE, currentStringByteIndex);
}
return this;
Expand Down
Loading