From 0e2a91cfe118173a232b0081344a881411cf5155 Mon Sep 17 00:00:00 2001 From: James Dunkerley Date: Thu, 22 Feb 2024 19:32:46 +0000 Subject: [PATCH] Remove `countMissing` from Storage and replace with a new `CountNothing` operation. (#9137) Removing another small piece of logic from the storages to it's own operation. --- .../Table/0.0.0-dev/src/Data/Column.enso | 3 +- .../data/column/operation/CountNothing.java | 64 +++++++++++++++++++ .../data/column/storage/BoolStorage.java | 8 --- .../column/storage/MixedStorageFacade.java | 5 -- .../column/storage/SpecializedStorage.java | 20 +----- .../table/data/column/storage/Storage.java | 5 -- .../storage/numeric/AbstractLongStorage.java | 12 ++-- .../storage/numeric/ComputedLongStorage.java | 5 -- .../numeric/ComputedNullableLongStorage.java | 27 ++++---- .../column/storage/numeric/DoubleStorage.java | 8 --- .../column/storage/numeric/LongStorage.java | 16 +---- .../table/data/index/MultiValueIndex.java | 3 +- .../table/parsing/TypeInferringParser.java | 4 +- .../table_test_helpers/ExplodingStorage.java | 5 -- .../src/Formatting/Parse_Values_Spec.enso | 7 ++ 15 files changed, 101 insertions(+), 91 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/data/column/operation/CountNothing.java diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index 03fd5ab3942e..e7388ff41b67 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -33,6 +33,7 @@ from project.Internal.Column_Format import all from project.Internal.Java_Exports import make_date_builder_adapter, make_string_builder polyglot java import org.enso.base.Time_Utils +polyglot java import org.enso.table.data.column.operation.CountNothing polyglot java import org.enso.table.data.column.operation.UnaryOperation polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation @@ -2085,7 +2086,7 @@ type Column example_count_nothing = Examples.text_column_2.count_nothing count_nothing : Integer - count_nothing self = self.java_column.getStorage.countMissing + count_nothing self = CountNothing.apply self.java_column ## GROUP Standard.Base.Metadata Returns the number of non-null items in this column. diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountNothing.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountNothing.java new file mode 100644 index 000000000000..f1f4e064d5b7 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountNothing.java @@ -0,0 +1,64 @@ +package org.enso.table.data.column.operation; + +import org.enso.table.data.column.storage.ColumnStorage; +import org.enso.table.data.column.storage.ColumnStorageWithNothingMap; +import org.enso.table.data.table.Column; +import org.graalvm.polyglot.Context; + +/** An operation for counting the number of Nothing values in a Column. */ +public class CountNothing { + /** Counts the number of Nothing values in the given column. */ + public static long apply(Column column) { + ColumnStorage storage = column.getStorage(); + return applyToStorage(storage); + } + + /** Counts the number of Nothing values in the given storage. */ + public static long applyToStorage(ColumnStorage storage) { + if (storage instanceof ColumnStorageWithNothingMap withNothingMap) { + return withNothingMap.getIsNothingMap().cardinality(); + } + + Context context = Context.getCurrent(); + long count = 0; + for (long i = 0; i < storage.getSize(); i++) { + if (storage.isNothing(i)) { + count += 1; + } + context.safepoint(); + } + return count; + } + + /** Returns true if any value in the storage is Nothing. */ + public static boolean anyNothing(ColumnStorage storage) { + if (storage instanceof ColumnStorageWithNothingMap withNothingMap) { + return !withNothingMap.getIsNothingMap().isEmpty(); + } + + Context context = Context.getCurrent(); + for (long i = 0; i < storage.getSize(); i++) { + if (storage.isNothing(i)) { + return true; + } + context.safepoint(); + } + return false; + } + + /** Returns true if all values in the storage are Nothing. */ + public static boolean allNothing(ColumnStorage storage) { + if (storage instanceof ColumnStorageWithNothingMap withNothingMap) { + return withNothingMap.getIsNothingMap().nextClearBit(0) >= storage.getSize(); + } + + Context context = Context.getCurrent(); + for (long i = 0; i < storage.getSize(); i++) { + if (!storage.isNothing(i)) { + return false; + } + context.safepoint(); + } + return true; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/BoolStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/BoolStorage.java index 3e7eca08c9e2..fd54d4d9a9c2 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/BoolStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/BoolStorage.java @@ -51,14 +51,6 @@ public int size() { return size; } - /** - * @inheritDoc - */ - @Override - public int countMissing() { - return isMissing.cardinality(); - } - @Override public StorageType getType() { return BooleanType.INSTANCE; diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java index edef42330a55..e66678a0bb32 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java @@ -26,11 +26,6 @@ public int size() { return underlyingStorage.size(); } - @Override - public int countMissing() { - return underlyingStorage.countMissing(); - } - @Override public StorageType getType() { return AnyObjectType.INSTANCE; diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/SpecializedStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/SpecializedStorage.java index 54fe8bcc2296..f149dc1d7349 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/SpecializedStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/SpecializedStorage.java @@ -3,6 +3,7 @@ import java.util.AbstractList; import java.util.BitSet; import java.util.List; +import org.enso.table.data.column.operation.CountNothing; import org.enso.table.data.column.operation.map.MapOperationProblemAggregator; import org.enso.table.data.column.operation.map.MapOperationStorage; import org.enso.table.data.column.storage.type.StorageType; @@ -42,23 +43,6 @@ public int size() { return size; } - /** - * @inheritDoc - */ - @Override - public int countMissing() { - Context context = Context.getCurrent(); - int count = 0; - for (int i = 0; i < size; i++) { - if (data[i] == null) { - count += 1; - } - - context.safepoint(); - } - return count; - } - /** * @param idx an index * @return the data item contained at the given index. @@ -161,7 +145,7 @@ public Storage appendNulls(int count) { @Override public Storage fillMissingFromPrevious(BoolStorage missingIndicator) { - if (missingIndicator != null && missingIndicator.countMissing() > 0) { + if (missingIndicator != null && CountNothing.anyNothing(missingIndicator)) { throw new IllegalArgumentException( "Missing indicator must not contain missing values itself."); } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java index 0b0b8592ce95..3df2b2f6a35c 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java @@ -28,11 +28,6 @@ public abstract class Storage implements ColumnStorage { */ public abstract int size(); - /** - * @return the number of NA elements in this column - */ - public abstract int countMissing(); - /** * @return the type tag of this column's storage. */ diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java index a59b49242a3d..954a778e7418 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java @@ -16,16 +16,13 @@ import org.enso.table.data.column.operation.map.numeric.comparisons.LessComparison; import org.enso.table.data.column.operation.map.numeric.comparisons.LessOrEqualComparison; import org.enso.table.data.column.operation.map.numeric.isin.LongIsInOp; -import org.enso.table.data.column.storage.BoolStorage; -import org.enso.table.data.column.storage.ColumnLongStorage; -import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.column.storage.ValueIsNothingException; +import org.enso.table.data.column.storage.*; import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.column.storage.type.StorageType; import org.graalvm.polyglot.Context; public abstract class AbstractLongStorage extends NumericStorage - implements ColumnLongStorage { + implements ColumnLongStorage, ColumnStorageWithNothingMap { public abstract long getItem(int idx); public abstract BitSet getIsMissing(); @@ -173,4 +170,9 @@ public long get(long index) throws ValueIsNothingException { } return getItem((int) index); } + + @Override + public BitSet getIsNothingMap() { + return getIsMissing(); + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedLongStorage.java index 37bedc257c6c..02e9d6776a75 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedLongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedLongStorage.java @@ -27,11 +27,6 @@ public int size() { return size; } - @Override - public int countMissing() { - return 0; - } - @Override public IntegerType getType() { return IntegerType.INT_64; diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java index 0e72870ef197..3244197eb4b8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java @@ -17,6 +17,8 @@ public abstract class ComputedNullableLongStorage extends AbstractLongStorage { protected final int size; + protected BitSet isMissing = null; + protected abstract Long computeItem(int idx); protected ComputedNullableLongStorage(int size) { @@ -28,11 +30,6 @@ public int size() { return size; } - @Override - public int countMissing() { - return 0; - } - @Override public IntegerType getType() { return IntegerType.INT_64; @@ -64,16 +61,20 @@ public long getItem(int idx) { @Override public BitSet getIsMissing() { - BitSet missing = new BitSet(); - Context context = Context.getCurrent(); - for (int i = 0; i < size; i++) { - if (computeItem(i) == null) { - missing.set(i); - } + if (isMissing == null) { + // Only compute once as needed. + BitSet missing = new BitSet(); + Context context = Context.getCurrent(); + for (int i = 0; i < size; i++) { + if (computeItem(i) == null) { + missing.set(i); + } - context.safepoint(); + context.safepoint(); + } + isMissing = missing; } - return missing; + return isMissing; } @Override diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 1176474590bb..c21e32d56ed5 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -67,14 +67,6 @@ public int size() { return size; } - /** - * @inheritDoc - */ - @Override - public int countMissing() { - return isMissing.cardinality(); - } - /** * @param idx an index * @return the data item contained at the given index. diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java index 4142623e59b6..e143ee19b7bc 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/LongStorage.java @@ -6,7 +6,6 @@ import org.enso.base.polyglot.NumericConverter; import org.enso.table.data.column.builder.BigIntegerBuilder; import org.enso.table.data.column.builder.NumericBuilder; -import org.enso.table.data.column.storage.ColumnStorageWithNothingMap; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.column.storage.type.StorageType; @@ -18,7 +17,7 @@ import org.graalvm.polyglot.Value; /** A column storing 64-bit integers. */ -public final class LongStorage extends AbstractLongStorage implements ColumnStorageWithNothingMap { +public final class LongStorage extends AbstractLongStorage { // TODO [RW] at some point we will want to add separate storage classes for byte, short and int, // for more compact storage and more efficient handling of smaller integers; for now we will be // handling this just by checking the bounds @@ -64,14 +63,6 @@ public int size() { return size; } - /** - * @inheritDoc - */ - @Override - public int countMissing() { - return isMissing.cardinality(); - } - /** * @param idx an index * @return the data item contained at the given index. @@ -260,9 +251,4 @@ public LongStorage widen(IntegerType widerType) { assert widerType.fits(type); return new LongStorage(data, size, isMissing, widerType); } - - @Override - public BitSet getIsNothingMap() { - return isMissing; - } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueIndex.java b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueIndex.java index 7500ba691892..827726cdab5e 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueIndex.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueIndex.java @@ -14,6 +14,7 @@ import org.enso.base.text.TextFoldingStrategy; import org.enso.table.aggregations.Aggregator; import org.enso.table.data.column.builder.Builder; +import org.enso.table.data.column.operation.CountNothing; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.table.Column; import org.enso.table.data.table.Table; @@ -166,7 +167,7 @@ public int size() { */ public KeyType findAnyNullKey() { for (Column c : keyColumns) { - boolean containsNulls = c.getStorage().countMissing() > 0; + boolean containsNulls = CountNothing.anyNothing(c.getStorage()); if (containsNulls) { for (KeyType key : locs.keySet()) { if (key.hasAnyNulls()) { diff --git a/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java b/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java index d8fcc66815e7..51691148abc1 100644 --- a/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java +++ b/std-bits/table/src/main/java/org/enso/table/parsing/TypeInferringParser.java @@ -1,6 +1,7 @@ package org.enso.table.parsing; import org.enso.table.data.column.builder.Builder; +import org.enso.table.data.column.operation.CountNothing; import org.enso.table.data.column.storage.Storage; import org.enso.table.parsing.problems.CommonParseProblemAggregator; import org.enso.table.parsing.problems.ParseProblemAggregator; @@ -43,8 +44,7 @@ public Storage parseColumn( Storage sourceStorage, CommonParseProblemAggregator problemAggregator) { // If there are no values, the Auto parser would guess some random type (the first one that is // checked). Instead, we just return the empty column unchanged. - boolean hasNoValues = - (sourceStorage.size() == 0) || (sourceStorage.countMissing() == sourceStorage.size()); + boolean hasNoValues = (sourceStorage.size() == 0) || CountNothing.allNothing(sourceStorage); if (hasNoValues) { return fallbackParser.parseColumn(sourceStorage, problemAggregator); } diff --git a/test/Base_Tests/polyglot-sources/enso-test-java-helpers/src/main/java/org/enso/table_test_helpers/ExplodingStorage.java b/test/Base_Tests/polyglot-sources/enso-test-java-helpers/src/main/java/org/enso/table_test_helpers/ExplodingStorage.java index f10848b763e0..bbee33be42fc 100644 --- a/test/Base_Tests/polyglot-sources/enso-test-java-helpers/src/main/java/org/enso/table_test_helpers/ExplodingStorage.java +++ b/test/Base_Tests/polyglot-sources/enso-test-java-helpers/src/main/java/org/enso/table_test_helpers/ExplodingStorage.java @@ -34,11 +34,6 @@ public int size() { return array.length; } - @Override - public int countMissing() { - return 0; - } - @Override public StorageType getType() { return IntegerType.INT_64; diff --git a/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso b/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso index 59d384ac3115..a0a2bc211679 100644 --- a/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso +++ b/test/Table_Tests/src/Formatting/Parse_Values_Spec.enso @@ -630,6 +630,13 @@ add_specs suite_builder = c1 = Column.from_vector "A" ["1", "2", "3"] c1.parse type=Nothing . should_fail_with Illegal_Argument + group_builder.specify "should return unchanged if all are Nothing or no rows" <| + c1 = Column.from_vector "A" [Nothing, Nothing, Nothing] Value_Type.Char + c1.parse.value_type . should_equal Value_Type.Char + + c2 = Column.from_vector "A" [] Value_Type.Char + c2.parse.value_type . should_equal Value_Type.Char + group_builder.specify "should error if the input column is not text" <| c1 = Column.from_vector "A" [1, 2, 3] r1 = c1.parse