Skip to content

Commit

Permalink
Implement relational NULL semantics for Nothing for in-memory Column …
Browse files Browse the repository at this point in the history
…operations (#8816)

Updates in-memory table column operations to treat Nothing as a relational NULL.
This PR does not include changes to Table.join.
  • Loading branch information
GregoryTravis authored Jan 24, 2024
1 parent 23d6fcd commit 5eb3f3b
Show file tree
Hide file tree
Showing 16 changed files with 444 additions and 115 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,8 @@
- [Added text_length to Column][8606]
- [Added none delimiter option for Data.Read][8627]
- [Added text_left and text_right to Column][8691]
- [Implement relational `NULL` semantics for `Nothing` for in-memory Column
operations.][5156]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -770,6 +772,7 @@
[4120]: https://github.com/enso-org/enso/pull/4120
[4050]: https://github.com/enso-org/enso/pull/4050
[4072]: https://github.com/enso-org/enso/pull/4072
[5156]: https://github.com/enso-org/enso/pull/5156
[5582]: https://github.com/enso-org/enso/pull/5582
[5645]: https://github.com/enso-org/enso/pull/5645
[5646]: https://github.com/enso-org/enso/pull/5646
Expand Down
21 changes: 21 additions & 0 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Data/Set.enso
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,27 @@ type Set
contains : Any -> Boolean
contains self value = self.underlying_map.contains_key value

## GROUP Logical
Checks if this set contains a given value, treating Nothing as a
relational NULL.

If the argument is non-Nothing and exists in `value`, return true.

If the argument is non-Nothing and does not exist in `value`, return
false if `value` does not contain a Nothing, or Nothing if `value` does
contain a Nothing.

If the argument is Nothing, return Nothing if `value` is non-empty, or
false if `value` is empty.
contains_relational : Any -> Boolean | Nothing
contains_relational self value =
case value of
_ : Nothing -> if self.is_empty then False else Nothing
_ ->
if self.contains value then True else
has_nulls = self.contains Nothing
if has_nulls then Nothing else False

## ALIAS add
GROUP Calculations
Adds a value to this set.
Expand Down
26 changes: 2 additions & 24 deletions distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso
Original file line number Diff line number Diff line change
Expand Up @@ -1536,34 +1536,12 @@ type Column
fix it later) and setting up the query - but at the set up this only
applies to adding nulls - setting any other object does not check the
type at this level anyway.
partitioned = vector.partition .is_nothing
nulls = partitioned.first
non_nulls = partitioned.second
## Since SQL `NULL IN (NULL)` yields `NULL`, we need to handle this case
separately. So we handle all non-null values using `IS_IN` and then
`OR` that with a null check (if the vector contained any nulls to
begin with). The implementation also ensures that even
`NULL IN (...)` is coalesced to False, so that negation works as
expected.
is_in_not_null = self.make_op "IS_IN" operands=non_nulls new_name=new_name
result = case nulls.not_empty of
True -> is_in_not_null || self.is_nothing
False -> is_in_not_null
result = self.make_op "IS_IN" operands=vector new_name=new_name
result.rename new_name
_ : Array -> self.is_in (Vector.from_polyglot_array vector)
column : Column -> if Helpers.check_connection self column . not then (Error.throw (Integrity_Error.Error "Column "+column.name)) else
## We slightly abuse the expression syntax putting a Query as one of
the sub-expressions. Once type-checking is added, we may need to
amend the signature of `SQL_Expression.Operation` to account for
this. Also, unfortunately as `NULL IN (...)` is `NULL` in SQL, we
need to do separate handling of nulls - we check if the target
column has any nulls and if so, we will do `IS NULL` checks for
our columns too. That is because, we want the containment check
for `NULL` to work the same way as for any other value.
in_subquery = Query.Select [Pair.new column.name column.expression] column.context
has_nulls_expression = SQL_Expression.Operation "BOOL_OR" [column.is_nothing.expression]
has_nulls_subquery = Query.Select [Pair.new "has_nulls" has_nulls_expression] column.context
new_expr = SQL_Expression.Operation "IS_IN_COLUMN" [self.expression, in_subquery, has_nulls_subquery]
new_expr = SQL_Expression.Operation "IS_IN_COLUMN" [self.expression, in_subquery]
# This mapping should never be imprecise, if there are errors we need to amend the implementation.
sql_type = self.connection.dialect.get_type_mapping.value_type_to_sql Value_Type.Boolean Problem_Behavior.Report_Error
new_type_ref = SQL_Type_Reference.from_constant sql_type . catch Inexact_Type_Coercion _->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,22 +255,16 @@ make_is_in arguments = case arguments.length of
_ ->
expr = arguments.first
list = arguments.drop 1
is_in = expr ++ " IN (" ++ (Builder.join ", " list) ++ ")"
## We ensure that even `NULL IN (...)` is coalesced to False, so that
negation will work as expected.
Builder.code "COALESCE(" ++ is_in ++ ", FALSE)"
expr ++ " IN (" ++ (Builder.join ", " list) ++ ")"

## PRIVATE
make_is_in_column : Vector Builder -> Builder
make_is_in_column arguments = case arguments.length of
3 ->
2 ->
expr = arguments.at 0
in_query = arguments.at 1
has_nulls_query = arguments.at 2
is_in = Builder.code "COALESCE(" ++ expr ++ " IN (" ++ in_query ++ "), FALSE)"
has_nulls = has_nulls_query.paren ++ " = TRUE"
Builder.code "CASE WHEN " ++ expr ++ " IS NULL THEN " ++ has_nulls ++ " ELSE " ++ is_in ++ " END"
_ -> Error.throw <| Illegal_State.Error ("The operation IS_IN_COLUMN requires at exactly 3 arguments: the expression, the IN subquery, the subquery checking for nulls.")
Builder.code "(" ++ expr ++ " IN (" ++ in_query ++ "))"
_ -> Error.throw <| Illegal_State.Error ("The operation IS_IN_COLUMN requires at exactly 2 arguments: the expression and the IN subquery.")

## PRIVATE
make_row_number : Vector Builder -> Row_Number_Metadata -> Builder
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1600,7 +1600,7 @@ type Column
run_vectorized_binary_op self op_name as_vector expected_result_type=Value_Type.Boolean skip_nulls=False new_name=result_name
False ->
set = Set.from_vector as_vector error_on_duplicates=False
run_unary_op self set.contains new_name=result_name skip_nulls=False expected_result_type=Value_Type.Boolean
run_unary_op self set.contains_relational new_name=result_name skip_nulls=False expected_result_type=Value_Type.Boolean

## GROUP Standard.Base.Conversions
ICON convert
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,23 @@ public Storage<?> runMap(S storage, List<?> arg) {
Context context = Context.getCurrent();
CompactRepresentation<T> compactRepresentation = prepareList(arg);
BitSet newVals = new BitSet();
for (int i = 0; i < storage.size(); i++) {
if (storage.isNa(i) && compactRepresentation.hasNulls) {
newVals.set(i);
} else if (compactRepresentation.coercedValues.contains(storage.getItemBoxed(i))) {
newVals.set(i);
}
BitSet missing = new BitSet();
if (arg.size() > 0) {
for (int i = 0; i < storage.size(); i++) {
if (storage.isNa(i)) {
missing.set(i);
} else if (compactRepresentation.coercedValues.contains(storage.getItemBoxed(i))) {
newVals.set(i);
} else if (compactRepresentation.hasNulls) {
missing.set(i);
} else {
// Leave as default=false
}

context.safepoint();
context.safepoint();
}
}
return new BoolStorage(newVals, new BitSet(), storage.size(), false);
return new BoolStorage(newVals, missing, storage.size(), false);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
package org.enso.table.data.column.operation.map.bool;

import java.util.BitSet;
import java.util.List;
import org.enso.table.data.column.operation.map.BinaryMapOperation;
import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.util.ImmutableBitSet;
import org.graalvm.polyglot.Context;

/**
* A specialized implementation for the IS_IN operation on booleans - since booleans have just three
* possible values we can have a highly efficient implementation that does not even rely on hashmap
* and after processing the input vector, performs the checks in constant time.
* and after processing the input vector, performs the checks using only BitSet builtins.
*/
public class BooleanIsInOp extends BinaryMapOperation<Boolean, BoolStorage> {
public BooleanIsInOp() {
Expand Down Expand Up @@ -60,43 +60,30 @@ public Storage<?> runZip(
}

private BoolStorage run(BoolStorage storage, boolean hadNull, boolean hadTrue, boolean hadFalse) {
BitSet newVals;
boolean negated = false;
int size = storage.size();
ImmutableBitSet values = new ImmutableBitSet(storage.getValues(), size);
ImmutableBitSet missing = new ImmutableBitSet(storage.getIsMissing(), size);
boolean negated = storage.isNegated();

if (hadNull && hadTrue && hadFalse) {
// We use empty newVals which has everything set to false and negate it to make all of that
// set to true with zero cost.
newVals = new BitSet();
negated = true;
} else if (!hadNull && !hadTrue && !hadFalse) {
// No values are present, so the result is to be false everywhere.
newVals = new BitSet();
} else if (hadNull && !hadTrue && !hadFalse) {
// Only missing values are in the set, so we just return the missing indicator.
newVals = storage.getIsMissing();
} else if (hadTrue && hadFalse) { // && !hadNull
// All non-missing values are in the set - so we just return the negated missing indicator.
newVals = storage.getIsMissing();
negated = true;
} else {
// hadTrue != hadFalse
newVals = storage.getValues().get(0, storage.size());
if (hadTrue) {
if (storage.isNegated()) {
newVals.flip(0, storage.size());
}
} else { // hadFalse
if (!storage.isNegated()) {
newVals.flip(0, storage.size());
}
}
newVals.andNot(storage.getIsMissing());
ImmutableBitSet newValues;
ImmutableBitSet newMissing;

if (hadNull) {
newVals.or(storage.getIsMissing());
}
if (hadTrue && !hadFalse) {
newValues = storage.isNegated() ? missing.notAndNot(values) : missing.notAnd(values);
newMissing =
hadNull ? (storage.isNegated() ? missing.or(values) : missing.orNot(values)) : missing;
} else if (!hadTrue && hadFalse) {
newValues = storage.isNegated() ? missing.notAnd(values) : missing.notAndNot(values);
newMissing =
hadNull ? (storage.isNegated() ? missing.orNot(values) : missing.or(values)) : missing;
} else if (hadTrue && hadFalse) {
newValues = missing.not();
newMissing = missing;
} else {
newValues = ImmutableBitSet.allFalse(size);
newMissing = hadNull ? ImmutableBitSet.allTrue(size) : ImmutableBitSet.allFalse(size);
}

return new BoolStorage(newVals, new BitSet(), storage.size(), negated);
return new BoolStorage(newValues.toBitSet(), newMissing.toBitSet(), size, false);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public BoolStorage runBinaryMap(
BitSet missing = new BitSet();
Context context = Context.getCurrent();
for (int i = 0; i < storage.size(); i++) {
if (storage.getItem(i) == null) {
if (storage.getItem(i) == null || arg == null) {
missing.set(i);
} else if (arg instanceof String s && Text_Utils.equals(storage.getItem(i), s)) {
r.set(i);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package org.enso.table.util;

import java.util.BitSet;

/**
* A wrapper around BitSet that implements boolean operations conveniently. Unlike BitSet,
* ImmutableBitSet takes a size parameter, which allows .not to be implemented.
*/
public class ImmutableBitSet {
private BitSet bitSet;
private int size;

public ImmutableBitSet(BitSet bitSet, int size) {
this.bitSet = bitSet;
this.size = size;
}

public BitSet toBitSet() {
return bitSet;
}

public ImmutableBitSet and(ImmutableBitSet other) {
assert size == other.size;
BitSet result = (BitSet) bitSet.clone();
result.and(other.bitSet);
return new ImmutableBitSet(result, size);
}

public ImmutableBitSet or(ImmutableBitSet other) {
assert size == other.size;
BitSet result = (BitSet) bitSet.clone();
result.or(other.bitSet);
return new ImmutableBitSet(result, size);
}

public ImmutableBitSet andNot(ImmutableBitSet other) {
assert size == other.size;
BitSet result = (BitSet) bitSet.clone();
result.andNot(other.bitSet);
return new ImmutableBitSet(result, size);
}

public ImmutableBitSet not() {
BitSet result = (BitSet) bitSet.clone();
result.flip(0, size);
return new ImmutableBitSet(result, size);
}

public ImmutableBitSet notAnd(ImmutableBitSet other) {
assert size == other.size;
BitSet result = (BitSet) bitSet.clone();
result.flip(0, size);
result.and(other.bitSet);
return new ImmutableBitSet(result, size);
}

public ImmutableBitSet notAndNot(ImmutableBitSet other) {
assert size == other.size;
BitSet result = (BitSet) bitSet.clone();
result.flip(0, size);
result.andNot(other.bitSet);
return new ImmutableBitSet(result, size);
}

public ImmutableBitSet orNot(ImmutableBitSet other) {
// Doing an extra operation to avoid doing an extra allocation.
// a || !b => !(!a && b)
assert size == other.size;
BitSet result = (BitSet) bitSet.clone();
result.flip(0, size);
result.and(other.bitSet);
result.flip(0, size);
return new ImmutableBitSet(result, size);
}

public static ImmutableBitSet allFalse(int size) {
return new ImmutableBitSet(new BitSet(), size);
}

public static ImmutableBitSet allTrue(int size) {
return new ImmutableBitSet(new BitSet(), size).not();
}
}
10 changes: 10 additions & 0 deletions test/Base_Tests/src/Data/Set_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@ spec =
s1.contains 3 . should_be_true
s1.contains 4 . should_be_false

Test.specify "should allow checking contains with relational NULL logic" <|
Set.from_vector [1, 2] . contains_relational 1 . should_be_true
Set.from_vector [1, 2] . contains_relational 3 . should_be_false
Set.from_vector [1, 2, Nothing] . contains_relational 1 . should_be_true
Set.from_vector [1, 2, Nothing] . contains_relational 3 . should_equal Nothing
Set.from_vector [1, 2, Nothing] . contains_relational Nothing . should_equal Nothing
Set.from_vector [1, 2] . contains_relational Nothing . should_equal Nothing
Set.from_vector [Nothing] . contains_relational Nothing . should_equal Nothing
Set.from_vector [] . contains_relational Nothing . should_be_false

Test.specify "should allow to compute a union, intersection and difference" <|
s1 = Set.from_vector [1, 2]
s2 = Set.from_vector [2, 3]
Expand Down
Loading

0 comments on commit 5eb3f3b

Please sign in to comment.