diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c242d4d9b54..f697afa66d75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -578,6 +578,7 @@ - [Renamed `Decimal` to `Float`.][7807] - [Implemented `Date_Time_Formatter` for more user-friendly date/time format parsing.][7826] +- [Implemented `Table.auto_value_types` for in-memory tables.][7908] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -820,6 +821,7 @@ [7776]: https://github.com/enso-org/enso/pull/7776 [7807]: https://github.com/enso-org/enso/pull/7807 [7826]: https://github.com/enso-org/enso/pull/7826 +[7908]: https://github.com/enso-org/enso/pull/7908 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso index e03cd62ef880..22f88f46efdf 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso @@ -1,6 +1,4 @@ import project.Any.Any -import project.Data.Ordering.Comparable -import project.Data.Ordering.Ordering import project.Nothing.Nothing from project.Data.Boolean.Boolean import False, True @@ -98,4 +96,3 @@ type Boolean if (27 % 3) == 0 then IO.println "Fizz" if_then : Any -> Any | Nothing if_then self ~on_true = @Builtin_Method "Boolean.if_then" - diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Numbers.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Numbers.enso index 2cf6d7a10fda..b3c92a28dcce 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Numbers.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Numbers.enso @@ -1,4 +1,5 @@ import project.Any.Any +import project.Data.Ordering.Comparable import project.Data.Locale.Locale import project.Data.Text.Text import project.Error.Error @@ -1169,3 +1170,39 @@ type Number_Parse_Error to_display_text : Text to_display_text self = "Could not parse " + self.text.to_text + " as a double." + +## A wrapper type that ensures that a function may only take positive integers. +type Positive_Integer + ## PRIVATE + This constructor should not be used by user code as it can be used to + break the invariants. Instead, this type should only be created by `new` + or conversions. + Value (integer : Integer) + + ## PRIVATE + ADVANCED + Constructor to create a `Positive_Integer` from an `Integer` - checking + if it satisfies the condition. User code should prefer the + `Positive_Integer.from` conversion. + new (integer : Integer) = + if integer > 0 then Positive_Integer.Value integer else + Error.throw (Illegal_Argument.Error "Expected a positive integer, but got "+integer.to_display_text) + +## Allows to create a `Positive_Integer` from an `Integer`. + It will throw `Illegal_Argument` if the provided integer is not positive. +Positive_Integer.from (that : Integer) = Positive_Integer.new that + +## PRIVATE +Integer.from (that : Positive_Integer) = that.integer + +## PRIVATE +type Positive_Integer_Comparator + ## PRIVATE + compare x y = + Comparable.from x.integer . compare x.integer y.integer + + ## PRIVATE + hash x = Comparable.from x.integer . hash x.integer + +## PRIVATE +Comparable.from (_:Positive_Integer) = Positive_Integer_Comparator diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index bd4b51cc806a..7f69a2e98519 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -1579,6 +1579,15 @@ type Column check_cast_compatibility self.value_type value_type <| self.internal_do_cast value_type on_problems + ## Change the value type of the column to a more specific one, based on its + contents. + + This operation is currently not available in the Database backend. + auto_value_type : Boolean -> Column + auto_value_type self shrink_types=False = + _ = shrink_types + Error.throw <| Unsupported_Database_Operation.Error "`Column.auto_value_type` is not supported in the Database backends." + ## PRIVATE Shares the core CAST logic between `cast` and `parse`. internal_do_cast : Value_Type -> Problem_Behavior -> Column diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso index c8dc2f54dc27..7397614c1d48 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso @@ -1979,6 +1979,15 @@ type Table new_column = column_to_cast.cast value_type on_problems table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update + ## Change the value type of table columns to a more specific one, based on + their contents. + + This operation is currently not available in the Database backend. + auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table + auto_value_types self columns=self.column_names shrink_types=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = + _ = [columns, shrink_types, error_on_missing_columns, on_problems] + Error.throw (Unsupported_Database_Operation.Error "Table.auto_value_types is not supported in the Database backends.") + ## ALIAS drop_missing_rows, dropna GROUP Standard.Base.Selections Remove rows which are all blank or containing blank values. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index d9c38acdf63d..38641c25c1ae 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1762,6 +1762,43 @@ type Column on_problems.attach_problems_before problems <| Column.from_storage self.name new_storage + ## Change the value type of the column to a more specific one, based on its + contents. + + Arguments: + - shrink_types: If set `True`, smaller types will be chosen if possible, + according to the rules below. Defaults to `False`. + + ? Auto Type Selection Rules + + - If a `Mixed` column can be assigned a single type, like `Char` or + `Integer`, that will be used. + - Text columns are not parsed. To do that, use the `parse` method. + - If a `Float` column contains only integers, it will be converted to + an Integer column. + - If a `Decimal` column contains only integers that could fit in a + 64-bit integer storage, it will be converted to an Integer column. + - If `shrink_types` is `False` (default), no other transformations are + applied. + - However, if `shrink_types` is set to `True`, then: + - Integer columns will be assigned the smallest size that can fit all + values (down to 16-bit integers; converting to the `Byte` type has + to be done manually through `cast`). + - If all elements in a text column have the same length, the type + will become fixed length. + - Otherwise, if a text column is variable length, but all text + elements are no longer than 255 characters, the column will get a + max length of 255. Otherwise, the column size limit will stay + unchanged. + auto_value_type : Boolean -> Column + auto_value_type self shrink_types=False = + new_value_type = case shrink_types of + False -> self.inferred_precise_value_type + True -> + Storage.to_value_type self.java_column.getStorage.inferPreciseTypeShrunk + # We run with Report_Error because we do not expect any problems. + self.cast new_value_type on_problems=Problem_Behavior.Report_Error + ## ALIAS transform column Applies `function` to each item in this column and returns the column diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 3b5dd7f1aa9e..31ce6bfb36f3 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -88,10 +88,20 @@ type Table Column.from_vector (v.at 0) (v.at 1) . java_column Column.Value java_col -> java_col _ -> invalid_input_shape - if cols.is_empty then Error.throw (Illegal_Argument.Error "Cannot create a table with no columns.") else - if (cols.all c-> c.getSize == cols.first.getSize).not then Error.throw (Illegal_Argument.Error "All columns must have the same row count.") else - if cols.distinct .getName . length != cols.length then Error.throw (Illegal_Argument.Error "Column names must be distinct.") else - Table.Value (Java_Table.new cols) + Panic.recover Illegal_Argument <| + if cols.is_empty then + Panic.throw (Illegal_Argument.Error "Cannot create a table with no columns.") + + if cols.distinct .getName . length != cols.length then + Panic.throw (Illegal_Argument.Error "Column names must be distinct.") + + mismatched_size_column = cols.find if_missing=Nothing c-> + c.getSize != cols.first.getSize + if mismatched_size_column.is_nothing.not then + msg = "All columns must have the same row count, but the column [" + mismatched_size_column.getName + "] has " + mismatched_size_column.getSize.to_text + " rows, while the column [" + cols.first.getName + "] has " + cols.first.getSize.to_text + " rows." + Panic.throw (Illegal_Argument.Error msg) + + Table.Value (Java_Table.new cols) ## GROUP Standard.Base.Constants Creates a new table from a vector of column names and a vector of vectors @@ -946,6 +956,9 @@ type Table Arguments: - columns: The selection of columns to cast. - value_type: The `Value_Type` to cast the column to. + - error_on_missing_columns: Specifies if a missing input column should + result in an error regardless of the `on_problems` settings. Defaults + to `True`. - on_problems: Specifies how to handle problems if they occur, reporting them as warnings by default. @@ -996,6 +1009,50 @@ type Table new_column = column_to_cast.cast value_type on_problems table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update + ## Change the value type of table columns to a more specific one, based on + their contents. + + This is most useful for `Mixed` type columns and will allow to narrow + down the type if all values in the column fit a more specific type. + + Arguments: + - columns: The selection of columns to convert. + - shrink_types: If set `True`, smaller types will be chosen if possible, + according to the rules below. Defaults to `False`. + - error_on_missing_columns: Specifies if a missing input column should + result in an error regardless of the `on_problems` settings. Defaults + to `True`. + - on_problems: Specifies how to handle problems if they occur, reporting + them as warnings by default. + + ? Auto Type Selection Rules + + - If a `Mixed` column can be assigned a single type, like `Char` or + `Integer`, that will be used. + - Text columns are not parsed. To do that, use the `parse` method. + - If a `Float` column contains only integers, it will be converted to + an Integer column. + - If a `Decimal` column contains only integers that could fit in a + 64-bit integer storage, it will be converted to an Integer column. + - If `shrink_types` is `False` (default), no other transformations are + applied. + - However, if `shrink_types` is set to `True`, then: + - Integer columns will be assigned the smallest size that can fit all + values (down to 16-bit integers; converting to the `Byte` type has + to be done manually through `cast`). + - If all elements in a text column have the same length, the type + will become fixed length. + - Otherwise, if a text column is variable length, but all text + elements are no longer than 255 characters, the column will get a + max length of 255. Otherwise, the column size limit will stay + unchanged. + auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table + auto_value_types self columns=self.column_names shrink_types=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = + selected = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False + selected.fold self table-> column_to_cast-> + new_column = column_to_cast.auto_value_type shrink_types + table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update + ## GROUP Standard.Base.Conversions Splits a column of text into a set of new columns. The original column will be removed from the table. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso index 1ef8d038f15f..ea1e57a7d3c1 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso @@ -18,7 +18,7 @@ polyglot java import org.enso.table.data.column.storage.type.IntegerType most_specific_value_type : Any -> Boolean -> Value_Type most_specific_value_type value use_smallest=False = case value of - _ : Float -> Value_Type.Float Bits.Bits_64 + _ : Float -> Value_Type.Float Bits.Bits_64 _ : Boolean -> Value_Type.Boolean _ : Date -> Value_Type.Date _ : Time_Of_Day -> Value_Type.Time @@ -33,9 +33,12 @@ most_specific_value_type value use_smallest=False = # We do a small rewrite here - for integers we always return the Integer type, even if the value is small enough to fit in a Byte. if value_type == Value_Type.Byte then Value_Type.Integer Bits.Bits_16 else value_type True -> Value_Type.Decimal precision=Nothing scale=0 - text : Text -> case use_smallest of - False -> Value_Type.Char size=Nothing variable_length=True - True -> Value_Type.Char size=text.length variable_length=False + text : Text -> + length = text.length + # Not using Char size=0 for empty strings, because that would be an invalid value. + case use_smallest && length > 0 of + True -> Value_Type.Char size=text.length variable_length=False + False -> Value_Type.Char size=Nothing variable_length=True ## TODO [RW] once we add Enso Native Object Type Value Type, we probably want to prefer it over Mixed _ -> Value_Type.Mixed diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Storage.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Storage.enso index a71f907c6c0d..61d761293118 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Storage.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Storage.enso @@ -54,7 +54,7 @@ closest_storage_type value_type = case value_type of Error.throw (Illegal_Argument.Error "Value_Type.Char with fixed length must have a non-nothing size") Value_Type.Char max_length variable_length -> fixed_length = variable_length.not - TextType.new max_length fixed_length + TextType.new (max_length : Integer) fixed_length Value_Type.Date -> DateType.INSTANCE # We currently will not support storing dates without timezones in in-memory mode. Value_Type.Date_Time _ -> DateTimeType.INSTANCE diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso index 0a0f63da9c7d..7af1c1298fe7 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso @@ -1,4 +1,5 @@ from Standard.Base import all +import Standard.Base.Data.Numbers.Positive_Integer import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import project.Data.Type.Value_Type_Helpers @@ -95,12 +96,22 @@ type Value_Type ANSI SQL: CHAR, VARCHAR, TEXT, LONGVARCHAR, NCHAR, NVARCHAR, TEXT, CLOB, NCLOB + ! Counting Characters + + Note that different backends may count the text in different ways. + The in-memory backend treats a single grapheme cluster (e.g. 💡) as a + single character unit. In most database systems more complex grapheme + clusters may be counted as multiple characters. So there isn't a 1-1 + correspondence between these limits across backends which may cause + strings to be truncated if they contain such characters and are close + to the limit. + Arguments: - size: the maximum number of characters that can be stored in the - column. It can be nothing to indicate no limit. + column. It can be nothing to indicate no limit. It cannot be 0. - variable_length: whether the size is a maximum or a fixed length. A fixed length string must have a non-nothing size. - Char size:(Integer|Nothing)=Nothing variable_length:Boolean=True + Char (size : (Positive_Integer | Nothing) = Nothing) variable_length:Boolean=True ## Date @@ -383,15 +394,23 @@ type Value_Type Value_Type.Integer size -> "Integer (" + size.to_text + ")" Value_Type.Float size -> "Float (" + size.to_text + ")" Value_Type.Decimal precision scale -> "Decimal (precision=" + precision.to_text + ", scale=" + scale.to_text + ")" - Value_Type.Char size variable_length -> case variable_length of - True -> "Char (variable length, max_size=" + size.to_text + ")" - False -> "Char (fixed length, size=" + size.to_text + ")" + Value_Type.Char size variable_length -> + size_text = case size of + Nothing -> "unlimited" + _ -> size.to Integer . to_text + case variable_length of + True -> "Char (variable length, max_size=" + size_text + ")" + False -> "Char (fixed length, size=" + size_text + ")" Value_Type.Date -> "Date" Value_Type.Date_Time with_timezone -> "Date_Time (with_timezone=" + with_timezone.to_text + ")" Value_Type.Time -> "Time" - Value_Type.Binary size variable_length -> case variable_length of - True -> "Binary (variable length, max_size=" + size.to_text + " bytes)" - False -> "Binary (fixed length, size=" + size.to_text + " bytes)" + Value_Type.Binary size variable_length -> + size_text = case size of + Nothing -> "unlimited" + _ -> size.to Integer . to_text + " bytes" + case variable_length of + True -> "Binary (variable length, max_size=" + size_text + ")" + False -> "Binary (fixed length, size=" + size_text + ")" Value_Type.Unsupported_Data_Type type_name _ -> case type_name of Nothing -> "Unsupported_Data_Type" _ : Text -> "Unsupported_Data_Type (" + type_name + ")" diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java index abaf10902432..7e4bfc7e072e 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java @@ -10,6 +10,7 @@ import org.enso.table.data.column.storage.type.FloatType; import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.error.ValueTypeMismatchException; +import org.graalvm.polyglot.Context; // For now the BigInteger builder is just a stub, reusing the ObjectBuilder and adding a warning. public class BigIntegerBuilder extends TypedBuilderImpl { @@ -88,10 +89,12 @@ public void appendRawNoGrow(BigInteger value) { } public static BigIntegerBuilder retypeFromLongBuilder(LongBuilder longBuilder) { + BigIntegerBuilder res = new BigIntegerBuilder(longBuilder.data.length); int n = longBuilder.currentSize; - BigIntegerBuilder res = new BigIntegerBuilder(n); + Context context = Context.getCurrent(); for (int i = 0; i < n; i++) { res.appendNoGrow(BigInteger.valueOf(longBuilder.data[i])); + context.safepoint(); } return res; } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderChecked.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderChecked.java index d6cd63677956..0ccd80609db2 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderChecked.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderChecked.java @@ -26,10 +26,10 @@ public void appendNoGrow(Object o) { if (o == null) { isMissing.set(currentSize++); } else { - try { - long x = NumericConverter.coerceToLong(o); + Long x = NumericConverter.tryConvertingToLong(o); + if (x != null) { appendLongNoGrow(x); - } catch (UnsupportedOperationException e) { + } else { throw new ValueTypeMismatchException(type, o); } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderUnchecked.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderUnchecked.java index f36ab5f6c5d2..a835fa692955 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderUnchecked.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderUnchecked.java @@ -19,10 +19,10 @@ public void appendNoGrow(Object o) { if (o == null) { isMissing.set(currentSize++); } else { - try { - long x = NumericConverter.coerceToLong(o); - data[currentSize++] = x; - } catch (UnsupportedOperationException e) { + Long x = NumericConverter.tryConvertingToLong(o); + if (x != null) { + appendLongNoGrow(x); + } else { throw new ValueTypeMismatchException(getType(), o); } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/cast/ToTextStorageConverter.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/cast/ToTextStorageConverter.java index b76839d32eb4..df3f49be7f23 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/cast/ToTextStorageConverter.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/cast/ToTextStorageConverter.java @@ -29,8 +29,8 @@ public ToTextStorageConverter(TextType textType) { public Storage cast(Storage storage, CastProblemBuilder problemBuilder) { if (storage instanceof StringStorage stringStorage) { - if (stringStorage.getType().equals(targetType)) { - return stringStorage; + if (canAvoidCopying(stringStorage)) { + return retypeStringStorage(stringStorage); } else { return adaptStringStorage(stringStorage, problemBuilder); } @@ -150,7 +150,8 @@ private Storage castDoubleStorage(DoubleStorage doubleStorage, CastProbl return builder.seal(); } - private Storage castDateTimeStorage(Storage storage, Function converter, CastProblemBuilder problemBuilder) { + private Storage castDateTimeStorage(Storage storage, Function converter, + CastProblemBuilder problemBuilder) { Context context = Context.getCurrent(); StringBuilder builder = new StringBuilder(storage.size(), targetType); for (int i = 0; i < storage.size(); i++) { @@ -204,4 +205,43 @@ private Storage adaptStringStorage(StringStorage stringStorage, CastProb problemBuilder.aggregateOtherProblems(builder.getProblems()); return builder.seal(); } + + private boolean canAvoidCopying(StringStorage stringStorage) { + if (targetType.fitsExactly(stringStorage.getType())) { + return true; + } + + long maxLength = Long.MIN_VALUE; + long minLength = Long.MAX_VALUE; + for (int i = 0; i < stringStorage.size(); i++) { + String value = stringStorage.getItem(i); + if (value == null) { + continue; + } + + long length = value.length(); + if (length > maxLength) { + maxLength = length; + } + if (length < minLength) { + minLength = length; + } + } + + if (targetType.fixedLength()) { + boolean effectivelyFixedLength = minLength == maxLength; + return effectivelyFixedLength && targetType.maxLength() == maxLength; + } else { + return targetType.maxLength() == -1 || maxLength <= targetType.maxLength(); + } + } + + /** + * Creates a new storage re-using the existing array. + *

+ * This can only be done if the values do not need any adaptations, checked by {@code canAvoidCopying}. + */ + private Storage retypeStringStorage(StringStorage stringStorage) { + return new StringStorage(stringStorage.getData(), stringStorage.size(), targetType); + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringStringOp.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringStringOp.java index 3f67de2f278e..b984f2dcd945 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringStringOp.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringStringOp.java @@ -23,7 +23,7 @@ public StringStringOp(String name) { public Storage runBinaryMap(SpecializedStorage storage, Object arg, MapOperationProblemBuilder problemBuilder) { int size = storage.size(); if (arg == null) { - StringBuilder builder = new StringBuilder(size, TextType.variableLengthWithLimit(0)); + StringBuilder builder = new StringBuilder(size, TextType.VARIABLE_LENGTH); builder.appendNulls(size); return builder.seal(); } else if (arg instanceof String argString) { diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java index 0736c163c389..ce0e7ef854bd 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java @@ -105,6 +105,20 @@ public StorageType inferPreciseType() { return inferredType; } + @Override + public StorageType inferPreciseTypeShrunk() { + Storage specialized = getInferredStorage(); + if (specialized == null) { + // If no specialized type is available, it means that: + assert inferredType instanceof AnyObjectType; + return AnyObjectType.INSTANCE; + } + + // If we are able to get a more specialized storage for more specific type - we delegate to its + // own shrinking logic. + return specialized.inferPreciseTypeShrunk(); + } + private Storage getInferredStorage() { if (!hasSpecializedStorageBeenInferred) { StorageType inferredType = inferPreciseType(); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java index 178f04a6e07b..12e438d1eb22 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java @@ -43,6 +43,11 @@ public StorageType inferPreciseType() { return underlyingStorage.inferPreciseType(); } + @Override + public StorageType inferPreciseTypeShrunk() { + return underlyingStorage.inferPreciseTypeShrunk(); + } + @Override public boolean isNa(long idx) { return underlyingStorage.isNa(idx); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java index 98582fb80ea5..adc67afcb558 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java @@ -39,6 +39,19 @@ public StorageType inferPreciseType() { return getType(); } + /** + * Returns the smallest type (according to Column.auto_value_type rules) that may still fit all + * values in this column. + * + *

It is a sibling of `inferPreciseType` that allows some further shrinking. It is kept + * separate, because `inferPreciseType` should be quick to compute (cached if needed) as it is + * used in typechecking of lots of operations. This one however, is only used in a specific + * `auto_value_type` use-case and rarely will need to be computed more than once. + */ + public StorageType inferPreciseTypeShrunk() { + return getType(); + } + /** * Returns a more specialized storage, if available. * diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 8f5be3a695d0..e3e3755137fa 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -3,9 +3,9 @@ import org.enso.base.Text_Utils; import org.enso.table.data.column.builder.Builder; import org.enso.table.data.column.builder.StringBuilder; -import org.enso.table.data.column.operation.map.MapOperationStorage; import org.enso.table.data.column.operation.map.BinaryMapOperation; import org.enso.table.data.column.operation.map.MapOperationProblemBuilder; +import org.enso.table.data.column.operation.map.MapOperationStorage; import org.enso.table.data.column.operation.map.UnaryMapOperation; import org.enso.table.data.column.operation.map.text.LikeOp; import org.enso.table.data.column.operation.map.text.StringBooleanOp; @@ -19,10 +19,13 @@ import java.util.BitSet; -/** A column storing strings. */ +/** + * A column storing strings. + */ public final class StringStorage extends SpecializedStorage { private final TextType type; + /** * @param data the underlying data * @param size the number of items stored @@ -111,7 +114,8 @@ public BoolStorage runZip( t.add( new UnaryMapOperation<>(Maps.IS_EMPTY) { @Override - protected BoolStorage runUnaryMap(SpecializedStorage storage, MapOperationProblemBuilder problemBuilder) { + protected BoolStorage runUnaryMap(SpecializedStorage storage, + MapOperationProblemBuilder problemBuilder) { BitSet r = new BitSet(); Context context = Context.getCurrent(); for (int i = 0; i < storage.size; i++) { @@ -162,4 +166,40 @@ protected TextType computeResultType(TextType a, TextType b) { }); return t; } + + @Override + public StorageType inferPreciseTypeShrunk() { + if (type.fixedLength()) { + return type; + } + + long minLength = Long.MAX_VALUE; + long maxLength = Long.MIN_VALUE; + for (int i = 0; i < size(); i++) { + String s = getItem(i); + if (s != null) { + long length = Text_Utils.grapheme_length(s); + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + } + } + + // maxLength will be <0 if all values were null and will be ==0 if all values were empty strings. + // In both of these cases, we avoid shrinking the type and return the original type instead. + if (maxLength <= 0) { + return getType(); + } + + final long SHORT_LENGTH_THRESHOLD = 255; + if (minLength == maxLength) { + return TextType.fixedLength(minLength); + } else if (maxLength <= SHORT_LENGTH_THRESHOLD && (type.maxLength() < 0 || SHORT_LENGTH_THRESHOLD < type.maxLength())) { + // If the string was unbounded or the bound was larger than 255, we shrink it to 255. + return TextType.variableLengthWithLimit(SHORT_LENGTH_THRESHOLD); + } else { + // Otherwise, we return the original type (because it was either smaller than the proposed 255 bound, or the + // existing elements to do not fit into the 255 bound). + return getType(); + } + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java index 356b839acea5..04a00bf8b00b 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java @@ -23,6 +23,8 @@ import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.type.IntegerType; +import org.enso.table.data.column.storage.type.StorageType; +import org.graalvm.polyglot.Context; public abstract class AbstractLongStorage extends NumericStorage { public abstract long getItem(int idx); @@ -77,6 +79,46 @@ public Builder createDefaultBuilderOfSameType(int capacity) { @Override public abstract IntegerType getType(); + @Override + public StorageType inferPreciseType() { + return getType(); + } + + @Override + public StorageType inferPreciseTypeShrunk() { + // If the type is already smallest possible, we return it unchanged (we will return 8-bit + // columns as-is, although + // we will not shrink 16-bit columns to 8-bits even if it were possible). + if (getType().bits().toInteger() <= 16) { + return getType(); + } + + IntegerType[] possibleTypes = + new IntegerType[] {IntegerType.INT_16, IntegerType.INT_32, IntegerType.INT_64}; + + int currentTypeIdx = 0; + int n = size(); + Context context = Context.getCurrent(); + for (int i = 0; i < n; i++) { + if (isNa(i)) { + continue; + } + + long item = getItem(i); + while (!possibleTypes[currentTypeIdx].fits(item)) { + currentTypeIdx++; + } + + if (currentTypeIdx >= possibleTypes.length - 1) { + break; + } + + context.safepoint(); + } + + return possibleTypes[currentTypeIdx]; + } + private static MapOperationStorage buildOps() { MapOperationStorage ops = new MapOperationStorage<>(); ops.add(new AddOp<>()) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java index ad7e10ad0a59..be5cb195df3a 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java @@ -20,6 +20,7 @@ import org.enso.table.data.column.storage.ObjectStorage; import org.enso.table.data.column.storage.SpecializedStorage; import org.enso.table.data.column.storage.type.BigIntegerType; +import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.column.storage.type.StorageType; public class BigIntegerStorage extends SpecializedStorage { @@ -59,7 +60,7 @@ protected SpecializedStorage newInstance(BigInteger[] data, int size @Override protected BigInteger[] newUnderlyingArray(int size) { - return new BigInteger[0]; + return new BigInteger[size]; } @Override @@ -96,4 +97,67 @@ public long getMaxPrecisionStored() { return cachedMaxPrecisionStored; } + + private StorageType inferredType = null; + + @Override + public StorageType inferPreciseType() { + if (inferredType == null) { + boolean allFitInLong = true; + int visitedCount = 0; + + for (int i = 0; i < size; i++) { + BigInteger value = data[i]; + if (value == null) { + continue; + } + + visitedCount++; + boolean fitsInLong = IntegerType.INT_64.fits(value); + if (!fitsInLong) { + allFitInLong = false; + break; + } + } + + inferredType = + (allFitInLong && visitedCount > 0) ? IntegerType.INT_64 : BigIntegerType.INSTANCE; + } + + return inferredType; + } + + @Override + public StorageType inferPreciseTypeShrunk() { + StorageType preciseType = inferPreciseType(); + if (preciseType instanceof IntegerType) { + return findSmallestIntegerTypeThatFits(); + } + + return preciseType; + } + + private StorageType findSmallestIntegerTypeThatFits() { + // This method assumes that all values _do_ fit in some integer type. + assert inferredType instanceof IntegerType; + + final BigIntegerStorage parent = this; + + // We create a Long storage that gets values by converting our storage. + ComputedNullableLongStorage longAdapter = + new ComputedNullableLongStorage(size) { + @Override + protected Long computeItem(int idx) { + BigInteger bigInteger = parent.getItem(idx); + if (bigInteger == null) { + return null; + } + + return bigInteger.longValueExact(); + } + }; + + // And rely on its shrinking logic. + return longAdapter.inferPreciseTypeShrunk(); + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java new file mode 100644 index 000000000000..835c41972553 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java @@ -0,0 +1,197 @@ +package org.enso.table.data.column.storage.numeric; + +import java.util.BitSet; +import java.util.List; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.column.storage.type.IntegerType; +import org.enso.table.data.index.Index; +import org.enso.table.data.mask.OrderMask; +import org.enso.table.data.mask.SliceRange; +import org.graalvm.polyglot.Context; + +/** + * Implements a storage that computes the ith stored value using some function. + * + *

This storage allows for missing values. Prefer {@link ComputedLongStorage} for non-nullable + * case. + */ +public abstract class ComputedNullableLongStorage extends AbstractLongStorage { + protected final int size; + + protected abstract Long computeItem(int idx); + + protected ComputedNullableLongStorage(int size) { + this.size = size; + } + + @Override + public int size() { + return size; + } + + @Override + public int countMissing() { + return 0; + } + + @Override + public IntegerType getType() { + return IntegerType.INT_64; + } + + @Override + public boolean isNa(long idx) { + if (idx < 0 || idx >= size) { + throw new IndexOutOfBoundsException( + "Index " + idx + " is out of bounds for range of length " + size + "."); + } + + return computeItem((int) idx) == null; + } + + @Override + public Long getItemBoxed(int idx) { + if (idx < 0 || idx >= size) { + throw new IndexOutOfBoundsException( + "Index " + idx + " is out of bounds for range of length " + size + "."); + } + + return computeItem(idx); + } + + public long getItem(int idx) { + return getItemBoxed(idx); + } + + @Override + public BitSet getIsMissing() { + BitSet missing = new BitSet(); + Context context = Context.getCurrent(); + for (int i = 0; i < size; i++) { + if (computeItem(i) == null) { + missing.set(i); + } + + context.safepoint(); + } + return missing; + } + + @Override + public Storage mask(BitSet mask, int cardinality) { + BitSet newMissing = new BitSet(); + long[] newData = new long[cardinality]; + int resIx = 0; + Context context = Context.getCurrent(); + for (int i = 0; i < size; i++) { + if (mask.get(i)) { + Long item = computeItem(i); + if (item == null) { + newMissing.set(resIx++); + } else { + newData[resIx++] = item; + } + } + + context.safepoint(); + } + return new LongStorage(newData, cardinality, newMissing, getType()); + } + + @Override + public Storage applyMask(OrderMask mask) { + int[] positions = mask.getPositions(); + long[] newData = new long[positions.length]; + BitSet newMissing = new BitSet(); + Context context = Context.getCurrent(); + for (int i = 0; i < positions.length; i++) { + if (positions[i] == Index.NOT_FOUND) { + newMissing.set(i); + } else { + Long item = computeItem(positions[i]); + if (item == null) { + newMissing.set(i); + } else { + newData[i] = item; + } + } + + context.safepoint(); + } + return new LongStorage(newData, positions.length, newMissing, getType()); + } + + @Override + public Storage countMask(int[] counts, int total) { + long[] newData = new long[total]; + BitSet newMissing = new BitSet(); + int pos = 0; + Context context = Context.getCurrent(); + for (int i = 0; i < counts.length; i++) { + Long item = computeItem(i); + if (item == null) { + newMissing.set(pos, pos + counts[i]); + pos += counts[i]; + } else { + long nonNullItem = item; + for (int j = 0; j < counts[i]; j++) { + newData[pos++] = nonNullItem; + } + } + + context.safepoint(); + } + return new LongStorage(newData, total, newMissing, getType()); + } + + @Override + public Storage slice(int offset, int limit) { + int newSize = Math.min(size - offset, limit); + long[] newData = new long[newSize]; + BitSet newMissing = new BitSet(); + Context context = Context.getCurrent(); + for (int i = 0; i < newSize; i++) { + Long item = computeItem(offset + i); + if (item == null) { + newMissing.set(i); + } else { + newData[i] = item; + } + context.safepoint(); + } + return new LongStorage(newData, newSize, newMissing, getType()); + } + + @Override + public Storage slice(List ranges) { + int newSize = SliceRange.totalLength(ranges); + long[] newData = new long[newSize]; + BitSet newMissing = new BitSet(newSize); + int offset = 0; + Context context = Context.getCurrent(); + for (SliceRange range : ranges) { + int rangeStart = range.start(); + int length = range.end() - rangeStart; + for (int i = 0; i < length; i++) { + Long item = computeItem(rangeStart + i); + if (item == null) { + newMissing.set(offset + i); + } else { + newData[offset + i] = item; + } + context.safepoint(); + } + offset += length; + } + + return new LongStorage(newData, newSize, newMissing, getType()); + } + + @Override + public AbstractLongStorage widen(IntegerType widerType) { + // Currently the implementation only reports 64-bit type so there is no widening to do - we can + // just return self. + assert getType().equals(IntegerType.INT_64); + return this; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 564845a8413e..8159aac377dc 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -26,6 +26,7 @@ import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.type.FloatType; +import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.index.Index; import org.enso.table.data.mask.OrderMask; @@ -388,4 +389,68 @@ public Storage slice(List ranges) { return new DoubleStorage(newData, newSize, newMissing); } + + private StorageType inferredType = null; + + @Override + public StorageType inferPreciseType() { + if (inferredType == null) { + boolean areAllIntegers = true; + int visitedNumbers = 0; + for (int i = 0; i < size; i++) { + if (isMissing.get(i)) { + continue; + } + + double value = Double.longBitsToDouble(data[i]); + visitedNumbers++; + boolean isWholeNumber = value % 1.0 == 0.0; + boolean canBeInteger = isWholeNumber && IntegerType.INT_64.fits(value); + if (!canBeInteger) { + areAllIntegers = false; + break; + } + } + + // We only switch to integers if there was at least one number. + inferredType = (areAllIntegers && visitedNumbers > 0) ? IntegerType.INT_64 : getType(); + } + + return inferredType; + } + + @Override + public StorageType inferPreciseTypeShrunk() { + StorageType inferred = inferPreciseType(); + if (inferred instanceof IntegerType) { + return findSmallestIntegerTypeThatFits(); + } else { + return inferred; + } + } + + private StorageType findSmallestIntegerTypeThatFits() { + assert inferredType instanceof IntegerType; + + final DoubleStorage parent = this; + + // We create a Long storage that gets values by converting our storage. + ComputedNullableLongStorage longAdapter = + new ComputedNullableLongStorage(size) { + @Override + protected Long computeItem(int idx) { + if (parent.isNa(idx)) { + return null; + } + + double value = parent.getItem(idx); + assert value % 1.0 == 0.0 + : "The value " + value + " should be a whole number (guaranteed by checks)."; + return (long) value; + } + }; + + // And rely on its shrinking logic. + return longAdapter.inferPreciseTypeShrunk(); + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java index b61b6738289c..dfeeaa79ad22 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java @@ -11,9 +11,11 @@ /** * Represents an underlying internal storage type that can be mapped to the Value Type that is exposed to users. */ -public sealed interface StorageType permits AnyObjectType, BigIntegerType, BooleanType, DateTimeType, DateType, FloatType, IntegerType, TextType, TimeOfDayType { +public sealed interface StorageType permits AnyObjectType, BigIntegerType, BooleanType, DateTimeType, DateType, + FloatType, IntegerType, TextType, TimeOfDayType { /** - * @return the StorageType that represents a given boxed item. + * @return the StorageType that represents a given boxed item. This has special handling for floating-point values - + * if they represent a whole number, they will be treated as integers. */ static StorageType forBoxedItem(Object item) { if (NumericConverter.isCoercibleToLong(item)) { @@ -21,6 +23,11 @@ static StorageType forBoxedItem(Object item) { } if (NumericConverter.isFloatLike(item)) { + double value = NumericConverter.coerceToDouble(item); + if (value % 1.0 == 0.0 && IntegerType.INT_64.fits(value)) { + return IntegerType.INT_64; + } + return FloatType.FLOAT_64; } @@ -32,7 +39,7 @@ static StorageType forBoxedItem(Object item) { case LocalTime t -> TimeOfDayType.INSTANCE; case LocalDateTime d -> DateTimeType.INSTANCE; case ZonedDateTime d -> DateTimeType.INSTANCE; - default -> null; + default -> AnyObjectType.INSTANCE; }; } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/TextType.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/TextType.java index cfc72407998b..311dee27c170 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/TextType.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/TextType.java @@ -3,6 +3,12 @@ import org.enso.base.Text_Utils; public record TextType(long maxLength, boolean fixedLength) implements StorageType { + public TextType { + if (maxLength == 0) { + throw new IllegalArgumentException("The maxLength of a text type must be positive or -1 to indicate unlimited length."); + } + } + public static final TextType VARIABLE_LENGTH = new TextType(-1, false); public static TextType fixedLength(long length) { @@ -10,7 +16,7 @@ public static TextType fixedLength(long length) { } public static TextType variableLengthWithLimit(long maxLength) { - assert maxLength >= 0; + assert maxLength > 0; return new TextType(maxLength, false); } @@ -90,6 +96,10 @@ public static TextType concatTypes(TextType type1, TextType type2) { boolean bothFixed = type1.fixedLength && type2.fixedLength; long lengthSum = type1.maxLength + type2.maxLength; + if (lengthSum == 0) { + return VARIABLE_LENGTH; + } + return new TextType(lengthSum, bothFixed); } } diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 7f024f35897c..e4dc5984ef49 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -67,7 +67,6 @@ spec setup = c.value_type.is_text . should_be_true c.to_vector . should_equal ["{{{MY Type [x=42] }}}", "{{{MY Type [x=X] }}}"] - # TODO what to test here? Test.specify "should allow to cast an integer column to a decimal type" <| t = table_builder [["X", [1, 2, 3]]] c = t.at "X" . cast Value_Type.Decimal @@ -116,6 +115,15 @@ spec setup = w2 = Problems.expect_warning Conversion_Failure c2 w2.affected_rows_count . should_equal 4 + Test.specify "should not allow 0-length Char type" <| + c1 = table_builder [["X", ["a", "", "bcd"]]] . at "X" + r1 = c1.cast (Value_Type.Char size=0 variable_length=False) + r1.should_fail_with Illegal_Argument + r1.catch.to_display_text . should_contain "positive" + + r2 = c1.cast (Value_Type.Char size=0 variable_length=True) + r2.should_fail_with Illegal_Argument + Test.group prefix+"Table/Column.cast - numeric" <| Test.specify "should allow to cast a boolean column to integer" <| t = table_builder [["X", [True, False, True]]] @@ -531,3 +539,254 @@ spec setup = r3 = t.parse ["X", "Y"] Value_Type.Integer r3.should_fail_with Missing_Input_Columns r3.catch.criteria . should_equal ["Y"] + + if setup.is_database then Test.group prefix+"Table/Column auto value type" <| + Test.specify "should report unsupported" <| + t = table_builder [["X", [1, 2, 3]]] + t.auto_value_types . should_fail_with Unsupported_Database_Operation + t.at "X" . auto_value_type . should_fail_with Unsupported_Database_Operation + + if setup.is_database.not then Test.group prefix+"Table/Column auto value type" <| + Test.specify "should allow to narrow down types of a Mixed column" <| + [True, False].each shrink_types-> + mixer = My_Type.Value 1 + t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]], ["mix", [1, mixer, "a"]], ["dates", [mixer, Date.new 2022, Date.new 2020]], ["datetimes", [mixer, Date_Time.new 2022 12 30 13 45, Date_Time.new 2020]], ["times", [mixer, Time_Of_Day.new 12 30, Time_Of_Day.new 13 45]], ["mixed_time", [Date.new 2022, Time_Of_Day.new 12 30, Date_Time.new 2019]], ["bools", [mixer, True, False]]] + t1 = t0.drop 1 + + t1.at "strs" . value_type . should_equal Value_Type.Mixed + t1.at "ints" . value_type . should_equal Value_Type.Mixed + t1.at "floats" . value_type . should_equal Value_Type.Mixed + t1.at "mix" . value_type . should_equal Value_Type.Mixed + t1.at "dates" . value_type . should_equal Value_Type.Mixed + t1.at "datetimes" . value_type . should_equal Value_Type.Mixed + t1.at "times" . value_type . should_equal Value_Type.Mixed + t1.at "mixed_time" . value_type . should_equal Value_Type.Mixed + t1.at "bools" . value_type . should_equal Value_Type.Mixed + + t2 = t1.auto_value_types shrink_types=shrink_types + # Depending on shrink_types value the size of the Char/Integer types may vary - exact details tested elsewhere. + t2.at "strs" . value_type . should_be_a (Value_Type.Char ...) + t2.at "ints" . value_type . should_be_a (Value_Type.Integer ...) + t2.at "floats" . value_type . should_equal Value_Type.Float + t2.at "mix" . value_type . should_equal Value_Type.Mixed + t2.at "dates" . value_type . should_equal Value_Type.Date + t2.at "datetimes" . value_type . should_equal Value_Type.Date_Time + t2.at "times" . value_type . should_equal Value_Type.Time + t2.at "mixed_time" . value_type . should_equal Value_Type.Mixed + t2.at "bools" . value_type . should_equal Value_Type.Boolean + + Test.specify "will only modify selected columns" <| + mixer = My_Type.Value 1 + t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]]] + t1 = t0.drop 1 + + t2 = t1.auto_value_types [] + t2.at "strs" . value_type . should_equal Value_Type.Mixed + t2.at "ints" . value_type . should_equal Value_Type.Mixed + t2.at "floats" . value_type . should_equal Value_Type.Mixed + + t3 = t1.auto_value_types ["strs"] + t3.at "strs" . value_type . should_equal Value_Type.Char + t3.at "ints" . value_type . should_equal Value_Type.Mixed + t3.at "floats" . value_type . should_equal Value_Type.Mixed + + # should match ints and floats but not strs + t4 = t1.auto_value_types "[if].*".to_regex + t4.at "strs" . value_type . should_equal Value_Type.Mixed + t4.at "ints" . value_type . should_equal Value_Type.Integer + t4.at "floats" . value_type . should_equal Value_Type.Float + + Test.specify "will convert a Float column to Integer if all values can be represented as long" <| + t1 = table_builder [["X", [1.0, 2.0, 3.0]], ["Y", [1.0, 2.5, 3.0]], ["Z", [1.0, 2.0, (2.0^100)]]] + t1.at "X" . value_type . should_equal Value_Type.Float + t1.at "Y" . value_type . should_equal Value_Type.Float + t1.at "Z" . value_type . should_equal Value_Type.Float + + t2 = t1.auto_value_types shrink_types=False + t2.at "X" . to_vector . should_equal [1, 2, 3] + t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "Y" . value_type . should_equal Value_Type.Float + ## Technically, Z could get converted to Decimal type. But IMO that + is not desirable - at this scale the Float is no longer a + precise type (as not even consecutive integers are exactly + representable). And Decimal is expected to be precise. So such a + conversion should only happen by explicit request, not + automatically. + t2.at "Z" . value_type . should_equal Value_Type.Float + + Test.specify "will not parse text columns" <| + t1 = table_builder [["X", ["1", "2", "3"]]] + c2 = t1.at "X" . auto_value_type + c2.value_type . should_equal Value_Type.Char + + Test.specify "will 'undo' a cast to Mixed" <| + t1 = table_builder [["X", [1, 2, 3]], ["Y", ["a", "b", "c"]]] + t2 = t1.cast ["X", "Y"] Value_Type.Mixed + t2.at "X" . value_type . should_equal Value_Type.Mixed + t2.at "Y" . value_type . should_equal Value_Type.Mixed + + t3 = t2.auto_value_types + t3.at "X" . value_type . should_equal Value_Type.Integer + t3.at "Y" . value_type . should_equal Value_Type.Char + + Test.specify "will choose Decimal type if all values are integers but cannot fit long" <| + c0 = table_builder [["X", [My_Type.Value 42, 1, 2, 2^100]]] . at "X" + c1 = c0.drop 1 + + c1.value_type . should_equal Value_Type.Mixed + c2 = c1.auto_value_type + c2.value_type . should_be_a (Value_Type.Decimal ...) + c2.to_vector . should_equal [1, 2, 2^100] + + Test.specify "will try to find the smallest integer type to fit the value (if shrink_types=True)" <| + [False, True].each is_mixed-> + prefix = if is_mixed then "mixed" else 0 + t0 = table_builder [["X", [prefix, 1, 2, 3]], ["Y", [prefix, 2^20, 2, 3]], ["Z", [prefix, 2^50, 2, 3]], ["F", [prefix, 1.0, 2.0, 3.0]]] + t1 = t0.drop 1 + + case is_mixed of + True -> t1.at "Z" . value_type . should_equal Value_Type.Mixed + False -> t1.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + + case is_mixed of + True -> t1.at "F" . value_type . should_equal Value_Type.Mixed + False -> t1.at "F" . value_type . should_equal Value_Type.Float + + t2 = t1.auto_value_types shrink_types=False + t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "F" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + + t3 = t1.auto_value_types shrink_types=True + # Even though X's values are small enough to fit in a Byte, we stick to 16-bit Integers. + t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + t3.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_32) + t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + # Shrinking Floats also finds the smallest type that fits. + t3.at "F" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + + Test.specify "will not return Byte columns by default, but should leave existing Byte columns intact" <| + c1 = table_builder [["X", [1, 2, 3]]] . at "X" . cast Value_Type.Byte + c1.value_type . should_equal Value_Type.Byte + + [True, False].each shrink_types-> + c2 = c1.auto_value_type shrink_types=shrink_types + c2.value_type . should_equal Value_Type.Byte + + Test.specify "Decimal (scale=0, i.e. integer) columns should also be shrinked if possible and shrink_types=True" <| + t0 = table_builder [["X", [2^100, 1, 2, 3]], ["Y", [10, 20, 2^100, 30]], ["Z", [1, 2, 3, 4]]] . cast "Z" (Value_Type.Decimal scale=0) + t1 = t0.drop 1 + + t1.at "X" . value_type . should_equal (Value_Type.Decimal scale=0) + t1.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0) + t1.at "Z" . value_type . should_equal (Value_Type.Decimal scale=0) + + t2 = t1.auto_value_types shrink_types=False + + # Without shrinking we get an integer type, but not the smallest one - just the default 64-bit. + t2.at "X" . to_vector . should_equal [1, 2, 3] + t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0) + t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + + t3 = t1.auto_value_types shrink_types=True + t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + t3.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0) + t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + + Test.specify "if all text values have the same length, will change the type to fixed-length string (if shrink_types=True)" <| + [False, True].each is_mixed-> + prefix = if is_mixed then 42 else "FOOBARBAZ" + c0 = table_builder [["X", [prefix, "aa", "bb", "cc"]]] . at "X" + c1 = c0.drop 1 + c1.to_vector . should_equal ["aa", "bb", "cc"] + + case is_mixed of + True -> c1.value_type . should_equal Value_Type.Mixed + False -> c1.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + + c2 = c1.auto_value_type shrink_types=False + c2.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + + c3 = c1.auto_value_type shrink_types=True + c3.value_type . should_equal (Value_Type.Char size=2 variable_length=False) + + c4 = table_builder [["X", ["a", "x", "y"]]] . at "X" . cast (Value_Type.Char size=100 variable_length=True) + c4.to_vector . should_equal ["a", "x", "y"] + c4.value_type . should_equal (Value_Type.Char size=100 variable_length=True) + + c5 = c4.auto_value_type shrink_types=False + c5.value_type . should_equal (Value_Type.Char size=100 variable_length=True) + + c6 = c4.auto_value_type shrink_types=True + c6.value_type . should_equal (Value_Type.Char size=1 variable_length=False) + + Test.specify "if all text values are empty string, the type will remain unchanged" <| + c1 = table_builder [["X", ["", ""]]] . at "X" + c2 = c1.cast (Value_Type.Char size=100 variable_length=True) + + c1.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + c2.value_type . should_equal (Value_Type.Char size=100 variable_length=True) + + [True, False].each shrink_types-> + c1_b = c1.auto_value_type shrink_types=shrink_types + c1_b.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + + c2_b = c2.auto_value_type shrink_types=shrink_types + c2_b.value_type . should_equal (Value_Type.Char size=100 variable_length=True) + + Test.specify "if all text values fit under 255 characters, will add a 255 length limit (if shrink_types=True)" <| + t1 = table_builder [["short_unbounded", ["a", "bb", "ccc"]], ["long_unbounded", ["a"*100, "b"*200, "c"*300]]] + + t2 = t1 . set (t1.at "short_unbounded" . cast (Value_Type.Char size=1000)) "short_1000" . set (t1.at "short_unbounded" . cast (Value_Type.Char size=10)) "short_10" . set (t1.at "long_unbounded" . cast (Value_Type.Char size=400)) "long_400" . set (t1.at "short_unbounded" . cast Value_Type.Mixed) "short_mixed" + t2.at "short_mixed" . value_type . should_equal Value_Type.Mixed + + t3 = t2.auto_value_types shrink_types=False + t3.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + t3.at "short_1000" . value_type . should_equal (Value_Type.Char size=1000 variable_length=True) + t3.at "short_10" . value_type . should_equal (Value_Type.Char size=10 variable_length=True) + # Mixed column gets to be text again. + t3.at "short_mixed" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + t3.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + t3.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True) + + t4 = t2.auto_value_types shrink_types=True + # Short ones get shortened to 255 unless they were shorter already. + t4.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=255 variable_length=True) + t4.at "short_1000" . value_type . should_equal (Value_Type.Char size=255 variable_length=True) + t4.at "short_10" . value_type . should_equal (Value_Type.Char size=10 variable_length=True) + t4.at "short_mixed" . value_type . should_equal (Value_Type.Char size=255 variable_length=True) + # Long ones cannot fit in 255 so they are kept as-is. + t4.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + t4.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True) + + Test.specify "can deal with all-null columns" <| + t0 = table_builder [["mix", [My_Type.Value 1, Nothing, Nothing]], ["int", [42, Nothing, Nothing]], ["str", ["a", Nothing, Nothing]], ["float", [1.5, Nothing, Nothing]], ["decimal", [2^100, 2^10, 2]]] + t1 = t0.drop 1 + + t1.at "mix" . value_type . should_equal Value_Type.Mixed + t1.at "int" . value_type . should_equal Value_Type.Integer + t1.at "float" . value_type . should_equal Value_Type.Float + t1.at "str" . value_type . should_equal Value_Type.Char + t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0) + + t2 = t1.auto_value_types shrink_types=False + t2.at "mix" . value_type . should_equal Value_Type.Mixed + t2.at "int" . value_type . should_equal Value_Type.Integer + ## Technically, if there are no elements, "all of elements" are + whole integers (quantification over empty domain is trivially true). + However, that would be rather not useful, so instead we keep the + original type. + t2.at "float" . value_type . should_equal Value_Type.Float + t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0) + t2.at "str" . value_type . should_equal Value_Type.Char + + t3 = t1.auto_value_types shrink_types=True + t3.at "mix" . value_type . should_equal Value_Type.Mixed + # Technically, if there are no elements, then they can be fit inside of the smallest types available: + t3.at "int" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + t3.at "float" . value_type . should_equal Value_Type.Float + t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0) + # But for Text we make an exception and keep the type unbounded: 0-length fixed length string simply would not make any sense. + t3.at "str" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) diff --git a/test/Table_Tests/src/Common_Table_Operations/Map_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Map_Spec.enso index 2bc2074d1f4e..9a46772568cd 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Map_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Map_Spec.enso @@ -125,7 +125,7 @@ spec setup = k x = if x == 2 then Time_Of_Day.new 13 05 else (x+1).to_text r7 = c1.map k expected_value_type=Value_Type.Char r7.should_fail_with Invalid_Value_Type - r7.catch.to_display_text . should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 13:05:00 of type Time" + r7.catch.to_display_text . should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 13:05:00 of type Time" l x = if x == 2 then 42 else Date.new 2022 05 x r8 = c1.map l expected_value_type=Value_Type.Date diff --git a/test/Table_Tests/src/Helpers/Value_Type_Spec.enso b/test/Table_Tests/src/Helpers/Value_Type_Spec.enso index d39064bd1e7e..646dc443fc1f 100644 --- a/test/Table_Tests/src/Helpers/Value_Type_Spec.enso +++ b/test/Table_Tests/src/Helpers/Value_Type_Spec.enso @@ -17,7 +17,7 @@ spec = Value_Type.Float.to_display_text . should_equal "Float (64 bits)" Value_Type.Decimal.to_display_text . should_equal "Decimal (precision=Nothing, scale=Nothing)" - Value_Type.Char.to_display_text . should_equal "Char (variable length, max_size=Nothing)" + Value_Type.Char.to_display_text . should_equal "Char (variable length, max_size=unlimited)" (Value_Type.Binary 8 False).to_display_text . should_equal "Binary (fixed length, size=8 bytes)" Value_Type.Date.to_display_text . should_equal "Date" diff --git a/test/Table_Tests/src/In_Memory/Column_Spec.enso b/test/Table_Tests/src/In_Memory/Column_Spec.enso index 3f0979b4e597..ae83602402fb 100644 --- a/test/Table_Tests/src/In_Memory/Column_Spec.enso +++ b/test/Table_Tests/src/In_Memory/Column_Spec.enso @@ -166,10 +166,14 @@ spec = c8.value_type . should_equal Value_Type.Mixed c8.to_vector . should_equal ["aaa", 42, Date.new 2022 08 22] + c9 = Column.from_vector "X" [Time_Of_Day.new 10 11 12, Time_Of_Day.new 11 30] Value_Type.Time + c9.value_type . should_equal Value_Type.Time + c9.to_vector . should_equal [Time_Of_Day.new 10 11 12, Time_Of_Day.new 11 30] + Test.specify "will fail if unexpected values are encountered for the requested type" <| r1 = Column.from_vector "X" ["a", 2] Value_Type.Char r1.should_fail_with Invalid_Value_Type - r1.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 2 of type Integer (16 bits)" + r1.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 2 of type Integer (16 bits)" r2 = Column.from_vector "X" ["aaa", "b"] (Value_Type.Char size=3 variable_length=False) r2.should_fail_with Invalid_Value_Type @@ -177,7 +181,7 @@ spec = r3 = Column.from_vector "X" ["aaa", 42] Value_Type.Char r3.should_fail_with Invalid_Value_Type - r3.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 42 of type Integer (16 bits)" + r3.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 42 of type Integer (16 bits)" r4 = Column.from_vector "X" [12, Time_Of_Day.new 10 11 12] Value_Type.Integer r4.should_fail_with Invalid_Value_Type @@ -199,6 +203,13 @@ spec = r8.should_fail_with Invalid_Value_Type r8.catch.to_display_text.should_contain "Expected type Byte, but got a value 1000000000 of type Integer (32 bits)" + Test.specify "will not allow to construct a column with Char size=0" <| + r1 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=False) + r1.should_fail_with Illegal_Argument + + r2 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=True) + r2.should_fail_with Illegal_Argument + Test.group "Rounding" <| Test.specify "should be able to round a column of decimals" <| Column.from_vector "foo" [1.2, 2.3, 2.5, 3.6] . round . should_equal (Column.from_vector "round([foo])" [1, 2, 3, 4])