Skip to content

Commit

Permalink
feat: initial formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
vibhatha committed May 25, 2024
1 parent 5d0431b commit 0b980df
Show file tree
Hide file tree
Showing 54 changed files with 1,266 additions and 1,137 deletions.
15 changes: 13 additions & 2 deletions java/algorithm/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<description>(Experimental/Contrib) A collection of algorithms for working with ValueVectors.</description>

<properties>
<spotless.version>2.42.0</spotless.version>
<spotless.version>2.30.0</spotless.version>
</properties>

<dependencies>
Expand Down Expand Up @@ -53,6 +53,16 @@
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
<version>3.1.0</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>

<profiles>
Expand Down Expand Up @@ -92,7 +102,7 @@
</formats>
<java>
<googleJavaFormat>
<version>1.9</version>
<version>1.17.0</version>
<style>GOOGLE</style>
</googleJavaFormat>
</java>
Expand All @@ -114,6 +124,7 @@
<execution>
<id>spotless-check</id>
<goals>
<goal>apply</goal>
<goal>check</goal>
</goals>
<phase>validate</phase>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,18 @@
import org.apache.arrow.vector.compare.RangeEqualsVisitor;
import org.apache.arrow.vector.util.DataSizeRoundingUtil;

/**
* Utilities for vector deduplication.
*/
/** Utilities for vector deduplication. */
class DeduplicationUtils {

/**
* Gets the start positions of the first distinct values in a vector.
*
* @param vector the target vector.
* @param runStarts the bit set to hold the start positions.
* @param <V> vector type.
*/
public static <V extends ValueVector> void populateRunStartIndicators(V vector, ArrowBuf runStarts) {
public static <V extends ValueVector> void populateRunStartIndicators(
V vector, ArrowBuf runStarts) {
int bufSize = DataSizeRoundingUtil.divideBy8Ceil(vector.getValueCount());
Preconditions.checkArgument(runStarts.capacity() >= bufSize);
runStarts.setZero(0, bufSize);
Expand All @@ -55,6 +55,7 @@ public static <V extends ValueVector> void populateRunStartIndicators(V vector,

/**
* Gets the run lengths, given the start positions.
*
* @param runStarts the bit set for start positions.
* @param runLengths the run length vector to populate.
* @param valueCount the number of values in the bit set.
Expand All @@ -76,15 +77,15 @@ public static void populateRunLengths(ArrowBuf runStarts, IntVector runLengths,
}

/**
* Gets distinct values from the input vector by removing adjacent
* duplicated values.
* Gets distinct values from the input vector by removing adjacent duplicated values.
*
* @param indicators the bit set containing the start positions of distinct values.
* @param inputVector the input vector.
* @param outputVector the output vector.
* @param <V> vector type.
*/
public static <V extends ValueVector> void populateDeduplicatedValues(
ArrowBuf indicators, V inputVector, V outputVector) {
ArrowBuf indicators, V inputVector, V outputVector) {
int dstIdx = 0;
for (int srcIdx = 0; srcIdx < inputVector.getValueCount(); srcIdx++) {
if (BitVectorHelper.get(indicators, srcIdx) != 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,29 +26,28 @@
import org.apache.arrow.vector.util.DataSizeRoundingUtil;

/**
* Remove adjacent equal elements from a vector.
* If the vector is sorted, it removes all duplicated values in the vector.
* Remove adjacent equal elements from a vector. If the vector is sorted, it removes all duplicated
* values in the vector.
*
* @param <V> vector type.
*/
public class VectorRunDeduplicator<V extends ValueVector> implements AutoCloseable {

/**
* Bit set for distinct values.
* If the value at some index is not equal to the previous value,
* its bit is set to 1, otherwise its bit is set to 0.
* Bit set for distinct values. If the value at some index is not equal to the previous value, its
* bit is set to 1, otherwise its bit is set to 0.
*/
private ArrowBuf distinctValueBuffer;

/**
* The vector to deduplicate.
*/
/** The vector to deduplicate. */
private final V vector;

private final BufferAllocator allocator;

/**
* Constructs a vector run deduplicator for a given vector.
* @param vector the vector to deduplicate. Ownership is NOT taken.
*
* @param vector the vector to deduplicate. Ownership is NOT taken.
* @param allocator the allocator used for allocating buffers for start indices.
*/
public VectorRunDeduplicator(V vector, BufferAllocator allocator) {
Expand All @@ -65,17 +64,20 @@ private void createDistinctValueBuffer() {

/**
* Gets the number of values which are different from their predecessor.
*
* @return the run count.
*/
public int getRunCount() {
if (distinctValueBuffer == null) {
createDistinctValueBuffer();
}
return vector.getValueCount() - BitVectorHelper.getNullCount(distinctValueBuffer, vector.getValueCount());
return vector.getValueCount()
- BitVectorHelper.getNullCount(distinctValueBuffer, vector.getValueCount());
}

/**
* Populates the output vector with the values that remain after removing adjacent duplicates.
*
* @param outVector the output vector.
*/
public void populateDeduplicatedValues(V outVector) {
Expand All @@ -88,14 +90,16 @@ public void populateDeduplicatedValues(V outVector) {

/**
* Gets the length of each distinct value.
*
* @param lengthVector the vector for holding length values.
*/
public void populateRunLengths(IntVector lengthVector) {
if (distinctValueBuffer == null) {
createDistinctValueBuffer();
}

DeduplicationUtils.populateRunLengths(distinctValueBuffer, lengthVector, vector.getValueCount());
DeduplicationUtils.populateRunLengths(
distinctValueBuffer, lengthVector, vector.getValueCount());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,26 @@
import org.apache.arrow.vector.ValueVector;

/**
* A dictionary builder is intended for the scenario frequently encountered in practice:
* the dictionary is not known a priori, so it is generated dynamically.
* In particular, when a new value arrives, it is tested to check if it is already
* in the dictionary. If so, it is simply neglected, otherwise, it is added to the dictionary.
* <p>
* The dictionary builder is intended to build a single dictionary.
* So it cannot be used for different dictionaries.
* </p>
* A dictionary builder is intended for the scenario frequently encountered in practice: the
* dictionary is not known a priori, so it is generated dynamically. In particular, when a new value
* arrives, it is tested to check if it is already in the dictionary. If so, it is simply neglected,
* otherwise, it is added to the dictionary.
*
* <p>The dictionary builder is intended to build a single dictionary. So it cannot be used for
* different dictionaries.
*
* <p>The following sample code shows how to use the dictionary builder:
*
* <pre>{@code
* DictionaryBuilder dictionaryBuilder = ...
* ...
* dictionaryBuilder.addValue(newValue);
* ...
* }</pre>
* </p>
* <p>
* With the above code, the dictionary vector will be populated,
* and it can be retrieved by the {@link DictionaryBuilder#getDictionary()} method.
* After that, dictionary encoding can proceed with the populated dictionary.
* </p>
*
* <p>With the above code, the dictionary vector will be populated, and it can be retrieved by the
* {@link DictionaryBuilder#getDictionary()} method. After that, dictionary encoding can proceed
* with the populated dictionary.
*
* @param <V> the dictionary vector type.
*/
Expand All @@ -58,7 +57,7 @@ public interface DictionaryBuilder<V extends ValueVector> {
* Try to add an element from the target vector to the dictionary.
*
* @param targetVector the target vector containing new element.
* @param targetIndex the index of the new element in the target vector.
* @param targetIndex the index of the new element in the target vector.
* @return the index of the new element in the dictionary.
*/
int addValue(V targetVector, int targetIndex);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,20 @@

/**
* A dictionary encoder translates one vector into another one based on a dictionary vector.
* According to Arrow specification, the encoded vector must be an integer based vector, which
* is the index of the original vector element in the dictionary.
* According to Arrow specification, the encoded vector must be an integer based vector, which is
* the index of the original vector element in the dictionary.
*
* @param <E> type of the encoded vector.
* @param <D> type of the vector to encode. It is also the type of the dictionary vector.
*/
public interface DictionaryEncoder<E extends BaseIntVector, D extends ValueVector> {

/**
* Translates an input vector into an output vector.
*
* @param input the input vector.
* @param output the output vector. Note that it must be in a fresh state. At least,
* all its validity bits should be clear.
* @param output the output vector. Note that it must be in a fresh state. At least, all its
* validity bits should be clear.
*/
void encode(D input, E output);
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,45 +18,36 @@
package org.apache.arrow.algorithm.dictionary;

import java.util.HashMap;

import org.apache.arrow.memory.util.ArrowBufPointer;
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
import org.apache.arrow.memory.util.hash.SimpleHasher;
import org.apache.arrow.vector.ElementAddressableVector;

/**
* This class builds the dictionary based on a hash table.
* Each add operation can be finished in O(1) time,
* regardless of the current dictionary size.
* This class builds the dictionary based on a hash table. Each add operation can be finished in
* O(1) time, regardless of the current dictionary size.
*
* @param <V> the dictionary vector type.
*/
public class HashTableBasedDictionaryBuilder<V extends ElementAddressableVector> implements DictionaryBuilder<V> {
public class HashTableBasedDictionaryBuilder<V extends ElementAddressableVector>
implements DictionaryBuilder<V> {

/**
* The dictionary to be built.
*/
/** The dictionary to be built. */
private final V dictionary;

/**
* If null should be encoded.
*/
/** If null should be encoded. */
private final boolean encodeNull;

/**
* The hash map for distinct dictionary entries.
* The key is the pointer to the dictionary element, whereas the value is the index in the dictionary.
* The hash map for distinct dictionary entries. The key is the pointer to the dictionary element,
* whereas the value is the index in the dictionary.
*/
private HashMap<ArrowBufPointer, Integer> hashMap = new HashMap<>();

/**
* The hasher used for calculating the hash code.
*/
/** The hasher used for calculating the hash code. */
private final ArrowBufHasher hasher;

/**
* Next pointer to try to add to the hash table.
*/
/** Next pointer to try to add to the hash table. */
private ArrowBufPointer nextPointer;

/**
Expand All @@ -83,7 +74,7 @@ public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull) {
*
* @param dictionary the dictionary to populate.
* @param encodeNull if null values should be added to the dictionary.
* @param hasher the hasher used to compute the hash code.
* @param hasher the hasher used to compute the hash code.
*/
public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull, ArrowBufHasher hasher) {
this.dictionary = dictionary;
Expand Down Expand Up @@ -125,7 +116,7 @@ public int addValues(V targetVector) {
* Try to add an element from the target vector to the dictionary.
*
* @param targetVector the target vector containing new element.
* @param targetIndex the index of the new element in the target vector.
* @param targetIndex the index of the new element in the target vector.
* @return the index of the new element in the dictionary.
*/
@Override
Expand Down
Loading

0 comments on commit 0b980df

Please sign in to comment.