Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[HUDI-4284] Implement bloom lookup tree as red-black tree #5978

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import org.apache.hudi.common.util.collection.Pair;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand Down Expand Up @@ -51,10 +50,6 @@ class IntervalTreeBasedGlobalIndexFileFilter implements IndexFileFilter {
allIndexFiles.add(file);
}));

// Note that the interval tree implementation doesn't have auto-balancing to ensure logN search time.
// So, we are shuffling the input here hoping the tree will not have any skewness. If not, the tree could be skewed
// which could result in N search time instead of logN.
Collections.shuffle(allIndexFiles);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yabola do you have some micro-benchmark as to how much improvement this change brings?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I actually don't have a benchmark for it. I think a red-black tree is a general optimization strategy, much like a hash map.

allIndexFiles.forEach(indexFile -> {
if (indexFile.hasKeyRanges()) {
indexLookUpTree
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import org.apache.hudi.common.util.collection.Pair;

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand All @@ -43,10 +42,6 @@ class IntervalTreeBasedIndexFileFilter implements IndexFileFilter {
*/
IntervalTreeBasedIndexFileFilter(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo) {
partitionToFileIndexInfo.forEach((partition, bloomIndexFiles) -> {
// Note that the interval tree implementation doesn't have auto-balancing to ensure logN search time.
// So, we are shuffling the input here hoping the tree will not have any skewness. If not, the tree could be
// skewed which could result in N search time instead of logN.
Collections.shuffle(bloomIndexFiles);
KeyRangeLookupTree lookUpTree = new KeyRangeLookupTree();
bloomIndexFiles.forEach(indexFileInfo -> {
if (indexFileInfo.hasKeyRanges()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,93 +18,91 @@

package org.apache.hudi.index.bloom;

import java.io.Serializable;
import org.apache.hudi.common.util.rbtree.RedBlackTree;

import java.util.HashSet;
import java.util.Set;

/**
* Look up tree implemented as interval trees to search for any given key in (N logN) time complexity.
* Look up tree implemented as red-black trees to search for any given key in (N logN) time complexity.
*/
class KeyRangeLookupTree implements Serializable {

private KeyRangeNode root;
class KeyRangeLookupTree extends RedBlackTree<KeyRangeNode, RecordKeyRange> {

/**
* @return the root of the tree. Could be {@code null}
* Flag for whether sub-tree min-max metrics need to be recalculated. When inserting or deleting nodes,
* we need to recalculate.
*/
public KeyRangeNode getRoot() {
return root;
private volatile boolean needReloadMetrics = false;

@Override
public void insert(KeyRangeNode newNode) {
// Flag the cached sub-tree min/max metrics as stale before delegating to the red-black tree insert;
// getMatchingIndexFiles(String) recalculates them when it finds this flag set.
needReloadMetrics = true;
super.insert(newNode);
}

@Override
public void remove(RecordKeyRange key) {
// Flag the cached sub-tree min/max metrics as stale before delegating to the red-black tree remove;
// getMatchingIndexFiles(String) recalculates them when it finds this flag set.
needReloadMetrics = true;
super.remove(key);
}

/**
* Inserts a new {@link KeyRangeNode} to this look up tree.
* If original node and new node matches with min record key and max record key, merge two nodes. In other words, add
* files from {@code newNode} to {@code originNode}.
*
* @param newNode the new {@link KeyRangeNode} to be inserted
* @param originNode previously inserted node
* @param newNode newly inserted same node
*/
void insert(KeyRangeNode newNode) {
root = insert(getRoot(), newNode);
@Override
protected void processWhenInsertSame(KeyRangeNode originNode, KeyRangeNode newNode) {
originNode.addFiles(newNode.getFileNameList());
}

/**
* Inserts a new {@link KeyRangeNode} to this look up tree.
*
* If no root exists, make {@code newNode} as the root and return the new root.
*
* If current root and newNode matches with min record key and max record key, merge two nodes. In other words, add
* files from {@code newNode} to current root. Return current root.
*
* If current root is < newNode if current root has no right sub tree update current root's right sub tree max and min
* set newNode as right sub tree else update root's right sub tree min and max with newNode's min and max record key
* as applicable recursively call insert() with root's right subtree as new root
*
* else // current root is >= newNode if current root has no left sub tree update current root's left sub tree max and
* min set newNode as left sub tree else update root's left sub tree min and max with newNode's min and max record key
* as applicable recursively call insert() with root's left subtree as new root
*
* @param root refers to the current root of the look up tree
* @param newNode newNode the new {@link KeyRangeNode} to be inserted
* Traverse the tree to calculate sub-tree min-max metrics.
*/
private KeyRangeNode insert(KeyRangeNode root, KeyRangeNode newNode) {
if (root == null) {
root = newNode;
return root;
private void calculateSubTreeMinMax(KeyRangeNode node) {
  // Rebuild the cached min/max record keys for every node at or below `node`.
  refreshSubTreeMinMax(node);
  // The metrics now match the current tree shape, so clear the stale flag. Without this the flag
  // stays true after the first insert/remove and every lookup repeats the full-tree traversal.
  // NOTE(review): assumes this method is only invoked on the tree root, as getMatchingIndexFiles does.
  needReloadMetrics = false;
}

/**
 * Recomputes, children first, the left/right sub-tree min and max record keys cached on
 * {@code node} and on every node below it.
 *
 * <p>When a child is absent the corresponding cached values are reset to {@code null} (the fields'
 * initial state), so that values computed for an earlier tree shape do not linger after an insert
 * or remove has changed the tree.
 *
 * @param node root of the sub-tree to refresh; may be {@code null}, in which case nothing is done
 */
private void refreshSubTreeMinMax(KeyRangeNode node) {
  if (node == null) {
    return;
  }

  final KeyRangeNode left = node.getLeft();
  if (left != null) {
    // The child's own cached metrics must be up to date before minRecord/maxRecord read them.
    refreshSubTreeMinMax(left);
    node.setLeftSubTreeMin(minRecord(left));
    node.setLeftSubTreeMax(maxRecord(left));
  } else {
    node.setLeftSubTreeMin(null);
    node.setLeftSubTreeMax(null);
  }

  final KeyRangeNode right = node.getRight();
  if (right != null) {
    refreshSubTreeMinMax(right);
    node.setRightSubTreeMin(minRecord(right));
    node.setRightSubTreeMax(maxRecord(right));
  } else {
    node.setRightSubTreeMin(null);
    node.setRightSubTreeMax(null);
  }
}

if (root.compareTo(newNode) == 0) {
root.addFiles(newNode.getFileNameList());
return root;
/**
 * Returns the smallest min record key among {@code node} itself and its two sub-trees, using the
 * sub-tree minimums already cached on {@code node} for whichever children are present.
 */
private String minRecord(KeyRangeNode node) {
  String smallest = node.getKey().getMinRecordKey();

  if (node.getLeft() != null) {
    final String leftMin = node.getLeftSubTreeMin();
    if (leftMin.compareTo(smallest) < 0) {
      smallest = leftMin;
    }
  }

  if (node.getRight() != null) {
    final String rightMin = node.getRightSubTreeMin();
    if (rightMin.compareTo(smallest) < 0) {
      smallest = rightMin;
    }
  }

  return smallest;
}

if (root.compareTo(newNode) < 0) {
if (root.getRight() == null) {
root.setRightSubTreeMax(newNode.getMaxRecordKey());
root.setRightSubTreeMin(newNode.getMinRecordKey());
root.setRight(newNode);
} else {
if (root.getRightSubTreeMax().compareTo(newNode.getMaxRecordKey()) < 0) {
root.setRightSubTreeMax(newNode.getMaxRecordKey());
}
if (root.getRightSubTreeMin().compareTo(newNode.getMinRecordKey()) > 0) {
root.setRightSubTreeMin(newNode.getMinRecordKey());
}
insert(root.getRight(), newNode);
}
} else {
if (root.getLeft() == null) {
root.setLeftSubTreeMax(newNode.getMaxRecordKey());
root.setLeftSubTreeMin(newNode.getMinRecordKey());
root.setLeft(newNode);
} else {
if (root.getLeftSubTreeMax().compareTo(newNode.getMaxRecordKey()) < 0) {
root.setLeftSubTreeMax(newNode.getMaxRecordKey());
}
if (root.getLeftSubTreeMin().compareTo(newNode.getMinRecordKey()) > 0) {
root.setLeftSubTreeMin(newNode.getMinRecordKey());
}
insert(root.getLeft(), newNode);
}
/**
* Get the maximum value among the node and its child nodes.
*/
private String maxRecord(KeyRangeNode node) {
String max = node.getKey().getMaxRecordKey();
if (node.getLeft() != null && node.getLeftSubTreeMax().compareTo(max) > 0) {
max = node.getLeftSubTreeMax();
}
return root;
if (node.getRight() != null && node.getRightSubTreeMax().compareTo(max) > 0) {
max = node.getRightSubTreeMax();
}
return max;
}

/**
Expand All @@ -114,6 +112,9 @@ private KeyRangeNode insert(KeyRangeNode root, KeyRangeNode newNode) {
* @return the {@link Set} of matching index file names
*/
Set<String> getMatchingIndexFiles(String lookupKey) {
if (needReloadMetrics) {
calculateSubTreeMinMax(getRoot());
}
Set<String> matchingFileNameSet = new HashSet<>();
getMatchingIndexFiles(getRoot(), lookupKey, matchingFileNameSet);
return matchingFileNameSet;
Expand All @@ -122,7 +123,7 @@ Set<String> getMatchingIndexFiles(String lookupKey) {
/**
* Fetches all the matching index files where the key could possibly be present.
*
* @param root refers to the current root of the look up tree
* @param root refers to the current root of the look up tree
* @param lookupKey the key to be searched for
*/
private void getMatchingIndexFiles(KeyRangeNode root, String lookupKey, Set<String> matchingFileNameSet) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,37 +18,33 @@

package org.apache.hudi.index.bloom;

import java.io.Serializable;
import org.apache.hudi.common.util.rbtree.RedBlackTreeNode;

import java.util.ArrayList;
import java.util.List;

/**
* Represents a node in the {@link KeyRangeLookupTree}. Holds information pertaining to a single index file, viz file
* Represents a red-black tree node in the {@link KeyRangeLookupTree}. Holds information pertaining to a single index file, viz file
* name, min record key and max record key.
*/
class KeyRangeNode implements Comparable<KeyRangeNode>, Serializable {
class KeyRangeNode extends RedBlackTreeNode<RecordKeyRange> {

private final List<String> fileNameList = new ArrayList<>();
private final String minRecordKey;
private final String maxRecordKey;
private String rightSubTreeMax = null;
private String leftSubTreeMax = null;
private String rightSubTreeMin = null;
private String leftSubTreeMin = null;
private KeyRangeNode left = null;
private KeyRangeNode right = null;

/**
* Instantiates a new {@link KeyRangeNode}.
*
* @param minRecordKey min record key of the index file
* @param maxRecordKey max record key of the index file
* @param fileName file name of the index file
* @param fileName file name of the index file
*/
KeyRangeNode(String minRecordKey, String maxRecordKey, String fileName) {
super(new RecordKeyRange(minRecordKey, maxRecordKey));
this.fileNameList.add(fileName);
this.minRecordKey = minRecordKey;
this.maxRecordKey = maxRecordKey;
}

/**
Expand All @@ -62,40 +58,24 @@ void addFiles(List<String> newFiles) {

@Override
public String toString() {
return "KeyRangeNode{minRecordKey='" + minRecordKey + '\'' + ", maxRecordKey='" + maxRecordKey + '\''
+ ", fileNameList=" + fileNameList + ", rightSubTreeMax='" + rightSubTreeMax + '\'' + ", leftSubTreeMax='"
+ leftSubTreeMax + '\'' + ", rightSubTreeMin='" + rightSubTreeMin + '\'' + ", leftSubTreeMin='" + leftSubTreeMin
+ '\'' + '}';
}

/**
* Compares the min record key of two nodes, followed by max record key.
*
* @param that the {@link KeyRangeNode} to be compared with
* @return the result of comparison. 0 if both min and max are equal in both. 1 if this {@link KeyRangeNode} is
* greater than the {@code that} keyRangeNode. -1 if {@code that} keyRangeNode is greater than this {@link
* KeyRangeNode}
*/
@Override
public int compareTo(KeyRangeNode that) {
int compareValue = minRecordKey.compareTo(that.minRecordKey);
if (compareValue == 0) {
return maxRecordKey.compareTo(that.maxRecordKey);
} else {
return compareValue;
}
final RecordKeyRange key = getKey();
String range = key != null ? "minRecordKey='" + key.getMinRecordKey() + '\'' + ", maxRecordKey='"
+ key.getMaxRecordKey() + "', " : "";
return "KeyRangeNode{" + range + "fileNameList=" + fileNameList
+ ", rightSubTreeMax='" + rightSubTreeMax + '\'' + ", leftSubTreeMax='" + leftSubTreeMax + '\''
+ ", rightSubTreeMin='" + rightSubTreeMin + '\'' + ", leftSubTreeMin='" + leftSubTreeMin + '\'' + '}';
}

public List<String> getFileNameList() {
return fileNameList;
public KeyRangeNode getLeft() {
return (KeyRangeNode) super.getLeft();
}

public String getMinRecordKey() {
return minRecordKey;
public KeyRangeNode getRight() {
return (KeyRangeNode) super.getRight();
}

public String getMaxRecordKey() {
return maxRecordKey;
public List<String> getFileNameList() {
return fileNameList;
}

public String getRightSubTreeMin() {
Expand Down Expand Up @@ -130,19 +110,11 @@ public void setLeftSubTreeMax(String leftSubTreeMax) {
this.leftSubTreeMax = leftSubTreeMax;
}

public KeyRangeNode getLeft() {
return left;
}

public void setLeft(KeyRangeNode left) {
this.left = left;
}

public KeyRangeNode getRight() {
return right;
public String getMinRecordKey() {
return getKey().getMinRecordKey();
}

public void setRight(KeyRangeNode right) {
this.right = right;
public String getMaxRecordKey() {
return getKey().getMaxRecordKey();
}
}
Loading