Introduce kudo serialization format. #2532

Closed · wants to merge 3 commits
23 changes: 23 additions & 0 deletions src/main/java/com/nvidia/spark/rapids/jni/Pair.java
Contributor:

Missing copyright and license. Same comment for many other files in this PR.

Collaborator (Author):

Will fix it.

@@ -0,0 +1,23 @@
package com.nvidia.spark.rapids.jni;

public class Pair<K, V> {
  private final K left;
  private final V right;

  public Pair(K left, V right) {
    this.left = left;
    this.right = right;
  }

  public K getLeft() {
    return left;
  }

  public V getRight() {
    return right;
  }

  public static <K, V> Pair<K, V> of(K left, V right) {
    return new Pair<>(left, right);
  }
}
49 changes: 49 additions & 0 deletions src/main/java/com/nvidia/spark/rapids/jni/SlicedTable.java
@@ -0,0 +1,49 @@
package com.nvidia.spark.rapids.jni;

import ai.rapids.cudf.Table;

import java.util.Objects;

import static com.nvidia.spark.rapids.jni.TableUtils.ensure;

/**
 * A sliced view of a table.
 * This view doesn't change ownership of the underlying data.
 */
public class SlicedTable {
  private final long startRow;
  private final long numRows;
  private final Table table;

  public SlicedTable(long startRow, long numRows, Table table) {
    Objects.requireNonNull(table, "table must not be null");
    ensure(startRow >= 0, "startRow must be >= 0");
    ensure(startRow < table.getRowCount(),
        () -> "startRow " + startRow + " is larger than table row count " + table.getRowCount());
    ensure(numRows >= 0, () -> "numRows " + numRows + " is negative");
    ensure(startRow + numRows <= table.getRowCount(), () -> "startRow + numRows is " + (startRow + numRows)
        + ", must be less than table row count " + table.getRowCount());

    this.startRow = startRow;
    this.numRows = numRows;
    this.table = table;
  }

  public long getStartRow() {
    return startRow;
  }

  public long getNumRows() {
    return numRows;
  }

  public Table getTable() {
Contributor:

This might be better named getUnslicedTable or getBaseTable, as its current name implies it could manifest the table after slicing takes effect.

There needs to be documentation for this method re: ownership expectations of the result, i.e. is the caller responsible for closing the table? If so, this should arguably invalidate the local table here. If not, who ultimately should close it?

Collaborator (Author):

Will fix it.

    return table;
  }

  public static SlicedTable from(Table table, long startRow, long numRows) {
    return new SlicedTable(startRow, numRows, table);
  }
}
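The constructor's bounds checks above can be exercised in isolation. The sketch below is a hypothetical standalone mirror of that validation logic (a plain row count stands in for a cudf Table), so the edge cases are easy to verify:

```java
// Hypothetical standalone mirror of SlicedTable's constructor checks;
// a plain long stands in for Table.getRowCount().
public class SliceBounds {
  static void validate(long startRow, long numRows, long tableRows) {
    if (startRow < 0) {
      throw new IllegalArgumentException("startRow must be >= 0");
    }
    if (startRow >= tableRows) {
      throw new IllegalArgumentException("startRow " + startRow
          + " is larger than table row count " + tableRows);
    }
    if (numRows < 0) {
      throw new IllegalArgumentException("numRows " + numRows + " is negative");
    }
    if (startRow + numRows > tableRows) {
      throw new IllegalArgumentException("startRow + numRows is " + (startRow + numRows)
          + ", must be less than table row count " + tableRows);
    }
  }

  public static void main(String[] args) {
    validate(5, 5, 10); // tail slice: accepted
    boolean threw = false;
    try {
      validate(5, 6, 10); // would run one row past the end
    } catch (IllegalArgumentException e) {
      threw = true;
    }
    System.out.println(threw); // true: the out-of-bounds slice is rejected
  }
}
```

One subtlety this surfaces: an empty slice (numRows == 0) of an empty table is rejected by the startRow < rowCount check, which may or may not be intended.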

131 changes: 131 additions & 0 deletions src/main/java/com/nvidia/spark/rapids/jni/TableUtils.java
@@ -0,0 +1,131 @@
package com.nvidia.spark.rapids.jni;

import ai.rapids.cudf.*;

import java.util.Arrays;
import java.util.Iterator;
import java.util.function.Function;
import java.util.function.LongConsumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;

public class TableUtils {
  public static Schema schemaOf(Table t) {
    Schema.Builder builder = Schema.builder();

    for (int i = 0; i < t.getNumberOfColumns(); i++) {
      ColumnVector cv = t.getColumn(i);
      addToSchema(cv, "col_" + i + "_", builder);
    }

    return builder.build();
  }
Contributor:

What's the use case for this? I would expect the RAPIDS Accelerator to want to build a Schema from the Spark schema rather than a schema from a Table that is forced to use fake column names.

Collaborator (Author):

It was used at write time, but it's no longer needed since I removed the write-time dependency on the schema. I'll remove it.

  public static void addToSchema(ColumnView cv, String namePrefix, Schema.Builder builder) {
    toSchemaInner(cv, 0, namePrefix, builder);
  }

  private static int toSchemaInner(ColumnView cv, int idx, String namePrefix,
      Schema.Builder builder) {
    String name = namePrefix + idx;

    Schema.Builder thisBuilder = builder.addColumn(cv.getType(), name);
    int lastIdx = idx;
    for (int i = 0; i < cv.getNumChildren(); i++) {
      lastIdx = toSchemaInner(cv.getChildColumnView(i), lastIdx + 1, namePrefix,
          thisBuilder);
    }

    return lastIdx;
  }

  public static void addToSchema(HostColumnVectorCore cv, String namePrefix, Schema.Builder builder) {
    toSchemaInner(cv, 0, namePrefix, builder);
  }

  private static int toSchemaInner(HostColumnVectorCore cv, int idx, String namePrefix,
      Schema.Builder builder) {
    String name = namePrefix + idx;

    Schema.Builder thisBuilder = builder.addColumn(cv.getType(), name);
    int lastIdx = idx;
    for (int i = 0; i < cv.getNumChildren(); i++) {
      lastIdx = toSchemaInner(cv.getChildColumnView(i), lastIdx + 1, namePrefix, thisBuilder);
    }

    return lastIdx;
  }

  public static void ensure(boolean condition, String message) {
    if (!condition) {
      throw new IllegalArgumentException(message);
    }
  }

  public static void ensure(boolean condition, Supplier<String> messageSupplier) {
    if (!condition) {
      throw new IllegalArgumentException(messageSupplier.get());
    }
  }
Comment on lines +12 to +22

Contributor:

Nit: These have nothing to do with Table, should be in an assertion, precondition, or more generic utility class. Similar comment for closeIfException, closeQuietly, and withTime.

Collaborator (Author):

Will fix it.


  /**
   * This method returns the length in bytes needed to represent X number of rows
   * e.g. getValidityLengthInBytes(5) => 1 byte
   * getValidityLengthInBytes(7) => 1 byte
   * getValidityLengthInBytes(14) => 2 bytes
   */
  public static long getValidityLengthInBytes(long rows) {
    return (rows + 7) / 8;
  }

  /**
   * This method returns the allocation size of the validity vector which is 64-byte aligned
   * e.g. getValidityAllocationSizeInBytes(5) => 64 bytes
   * getValidityAllocationSizeInBytes(14) => 64 bytes
   * getValidityAllocationSizeInBytes(513) => 128 bytes
   */
  static long getValidityAllocationSizeInBytes(long rows) {
    long numBytes = getValidityLengthInBytes(rows);
    return ((numBytes + 63) / 64) * 64;
  }

  public static <R extends AutoCloseable, T> T closeIfException(R resource, Function<R, T> function) {
    try {
      return function.apply(resource);
    } catch (Exception e) {
      if (resource != null) {
        try {
          resource.close();
        } catch (Exception inner) {
          // ignore
        }
      }
      throw e;
    }
  }

  public static <R extends AutoCloseable> void closeQuietly(Iterator<R> resources) {
    while (resources.hasNext()) {
      try {
        resources.next().close();
      } catch (Exception e) {
        // ignore
      }
    }
  }

  public static <R extends AutoCloseable> void closeQuietly(R... resources) {
    closeQuietly(Arrays.stream(resources).collect(Collectors.toList()));
  }

  public static <R extends AutoCloseable> void closeQuietly(Iterable<R> resources) {
    closeQuietly(resources.iterator());
  }

  public static <T> T withTime(Supplier<T> task, LongConsumer timeConsumer) {
    long now = System.nanoTime();
    T ret = task.get();
    timeConsumer.accept(System.nanoTime() - now);
    return ret;
  }
}
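The two validity-size helpers are pure arithmetic, so they can be sanity-checked standalone; this sketch simply restates the formulas in a local class (the class name is made up for illustration):

```java
// Local re-statement of TableUtils' validity-size math, for illustration only.
public class ValiditySizes {
  // One validity bit per row, rounded up to whole bytes.
  static long getValidityLengthInBytes(long rows) {
    return (rows + 7) / 8;
  }

  // Raw length padded up to the next 64-byte boundary, matching cudf's
  // 64-byte buffer alignment.
  static long getValidityAllocationSizeInBytes(long rows) {
    return ((getValidityLengthInBytes(rows) + 63) / 64) * 64;
  }

  public static void main(String[] args) {
    System.out.println(getValidityLengthInBytes(7));           // 1
    System.out.println(getValidityLengthInBytes(14));          // 2
    System.out.println(getValidityAllocationSizeInBytes(14));  // 64
    System.out.println(getValidityAllocationSizeInBytes(513)); // 128: 65 raw bytes pad to two 64-byte units
  }
}
```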
@@ -0,0 +1,47 @@
package com.nvidia.spark.rapids.jni.kudo;

import java.util.OptionalLong;

/**
 * This class stores the offsets of a column's buffers in the serialized data.
 */
public class ColumnOffsetInfo {
  private static final long INVALID_OFFSET = -1L;
  private final long validity;
  private final long offset;
  private final long data;
  private final long dataLen;

  public ColumnOffsetInfo(long validity, long offset, long data, long dataLen) {
    this.validity = validity;
    this.offset = offset;
    this.data = data;
    this.dataLen = dataLen;
  }

  public OptionalLong getValidity() {
    return (validity == INVALID_OFFSET) ? OptionalLong.empty() : OptionalLong.of(validity);
  }

  public OptionalLong getOffset() {
    return (offset == INVALID_OFFSET) ? OptionalLong.empty() : OptionalLong.of(offset);
  }

  public OptionalLong getData() {
    return (data == INVALID_OFFSET) ? OptionalLong.empty() : OptionalLong.of(data);
  }

  public long getDataLen() {
    return dataLen;
  }

  @Override
  public String toString() {
    return "ColumnOffsetInfo{" +
        "validity=" + validity +
        ", offset=" + offset +
        ", data=" + data +
        ", dataLen=" + dataLen +
        '}';
  }
}
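ColumnOffsetInfo encodes "buffer absent" as the sentinel -1 and surfaces it as OptionalLong.empty(); offset 0 is a legitimate position, which is why a sentinel rather than 0 is needed. A minimal standalone sketch of the same mapping (class and method names here are made up):

```java
import java.util.OptionalLong;

// Standalone illustration of the sentinel-to-OptionalLong mapping used by
// ColumnOffsetInfo: -1 means "no such buffer", anything else is a real offset.
public class OffsetSentinel {
  private static final long INVALID_OFFSET = -1L;

  static OptionalLong asOptional(long offset) {
    return (offset == INVALID_OFFSET) ? OptionalLong.empty() : OptionalLong.of(offset);
  }

  public static void main(String[] args) {
    System.out.println(asOptional(INVALID_OFFSET).isPresent()); // false: buffer absent
    System.out.println(asOptional(0L).getAsLong());             // 0: a valid offset
  }
}
```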
58 changes: 58 additions & 0 deletions src/main/java/com/nvidia/spark/rapids/jni/kudo/ColumnViewInfo.java
@@ -0,0 +1,58 @@
package com.nvidia.spark.rapids.jni.kudo;

import ai.rapids.cudf.DType;
import ai.rapids.cudf.DeviceMemoryBuffer;

import static com.nvidia.spark.rapids.jni.kudo.KudoSerializer.safeLongToInt;

public class ColumnViewInfo {
  private final DType dtype;
  private final ColumnOffsetInfo offsetInfo;
  private final long nullCount;
  private final long rowCount;

  public ColumnViewInfo(DType dtype, ColumnOffsetInfo offsetInfo,
      long nullCount, long rowCount) {
    this.dtype = dtype;
    this.offsetInfo = offsetInfo;
    this.nullCount = nullCount;
    this.rowCount = rowCount;
  }

  public long buildColumnView(DeviceMemoryBuffer buffer, long[] childrenView) {
    long bufferAddress = buffer.getAddress();

    long dataAddress = 0;
    if (offsetInfo.getData().isPresent()) {
      dataAddress = bufferAddress + offsetInfo.getData().getAsLong();
    }

    long validityAddress = 0;
    if (offsetInfo.getValidity().isPresent()) {
      validityAddress = bufferAddress + offsetInfo.getValidity().getAsLong();
    }

    long offsetsAddress = 0;
    if (offsetInfo.getOffset().isPresent()) {
      offsetsAddress = bufferAddress + offsetInfo.getOffset().getAsLong();
    }

    return RefUtils.makeCudfColumnView(
        dtype.getTypeId().getNativeId(), dtype.getScale(),
        dataAddress, offsetInfo.getDataLen(),
        offsetsAddress, validityAddress,
        safeLongToInt(nullCount), safeLongToInt(rowCount),
        childrenView);
  }

  @Override
  public String toString() {
    return "ColumnViewInfo{" +
        "dtype=" + dtype +
        ", offsetInfo=" + offsetInfo +
        ", nullCount=" + nullCount +
        ", rowCount=" + rowCount +
        '}';
  }
}
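buildColumnView resolves each stored offset to an absolute device address by adding the buffer's base address, leaving 0 (cudf's convention for a missing buffer) when the offset is absent. A hypothetical standalone mirror of that address arithmetic:

```java
import java.util.OptionalLong;

// Illustration only: the base-plus-offset resolution performed in buildColumnView.
public class AddressResolver {
  // Returns base + offset when the buffer exists, or 0 for "no buffer".
  static long resolve(long baseAddress, OptionalLong offset) {
    return offset.isPresent() ? baseAddress + offset.getAsLong() : 0L;
  }

  public static void main(String[] args) {
    System.out.println(resolve(1024L, OptionalLong.of(128L))); // 1152
    System.out.println(resolve(1024L, OptionalLong.empty()));  // 0
  }
}
```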
Contributor:

This and DataWriter are copies from cudf; we should discuss whether to just make the cudf versions public instead of replicating, especially if we don't anticipate needing any changes.

Collaborator (Author):

Will fix it.

@@ -0,0 +1,67 @@
package com.nvidia.spark.rapids.jni.kudo;

import ai.rapids.cudf.HostMemoryBuffer;

import java.io.DataOutputStream;
import java.io.IOException;

/**
 * Visible for testing
 */
class DataOutputStreamWriter extends DataWriter {
  private final byte[] arrayBuffer = new byte[1024 * 128];
  private final DataOutputStream dout;

  public DataOutputStreamWriter(DataOutputStream dout) {
    this.dout = dout;
  }

  @Override
  public void writeByte(byte b) throws IOException {
    dout.writeByte(b);
  }

  @Override
  public void writeShort(short s) throws IOException {
    dout.writeShort(s);
  }

  @Override
  public void writeInt(int i) throws IOException {
    dout.writeInt(i);
  }

  @Override
  public void writeIntNativeOrder(int i) throws IOException {
    // TODO this only works on little-endian architectures such as x86. If we need
    // to support others we need to detect the endianness and switch to the right implementation.
    writeInt(Integer.reverseBytes(i));
  }

  @Override
  public void writeLong(long val) throws IOException {
    dout.writeLong(val);
  }

  @Override
  public void copyDataFrom(HostMemoryBuffer src, long srcOffset, long len) throws IOException {
    long dataLeft = len;
    while (dataLeft > 0) {
      int amountToCopy = (int) Math.min(arrayBuffer.length, dataLeft);
      src.getBytes(arrayBuffer, 0, srcOffset, amountToCopy);
      dout.write(arrayBuffer, 0, amountToCopy);
      srcOffset += amountToCopy;
      dataLeft -= amountToCopy;
    }
  }

  @Override
  public void flush() throws IOException {
    dout.flush();
  }

  @Override
  public void write(byte[] arr, int offset, int length) throws IOException {
    dout.write(arr, offset, length);
  }
}
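The TODO in writeIntNativeOrder notes that unconditional byte-reversal is only correct on little-endian hosts. One way to make it portable, sketched here under made-up names, is to consult java.nio.ByteOrder once and swap only when the native order differs from DataOutputStream's big-endian output:

```java
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;

// Hypothetical endianness-aware variant of writeIntNativeOrder.
public class NativeOrderWriter {
  private static final boolean NATIVE_IS_LITTLE =
      ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;

  static void writeIntNativeOrder(DataOutputStream dout, int i) throws IOException {
    // DataOutputStream always emits big-endian, so reverse only on little-endian hosts.
    dout.writeInt(NATIVE_IS_LITTLE ? Integer.reverseBytes(i) : i);
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    writeIntNativeOrder(new DataOutputStream(bytes), 0x01020304);
    // Compare against ByteBuffer's native-order encoding of the same value.
    byte[] expected = ByteBuffer.allocate(4).order(ByteOrder.nativeOrder())
        .putInt(0x01020304).array();
    System.out.println(Arrays.equals(bytes.toByteArray(), expected)); // true on any host
  }
}
```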