From c31df3c6133d18f441d7eb4bbb4f10fd18c52018 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 25 Sep 2023 14:17:25 +0200 Subject: [PATCH 01/31] Initial doc and method stubs --- .../Database/0.0.0-dev/src/Data/Column.enso | 9 ++++ .../Database/0.0.0-dev/src/Data/Table.enso | 9 ++++ .../Table/0.0.0-dev/src/Data/Column.enso | 31 +++++++++++++ .../Table/0.0.0-dev/src/Data/Table.enso | 44 +++++++++++++++++++ .../0.0.0-dev/src/Data/Type/Enso_Types.enso | 2 +- 5 files changed, 94 insertions(+), 1 deletion(-) diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index bd4b51cc806a..13ad41d0105c 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -1579,6 +1579,15 @@ type Column check_cast_compatibility self.value_type value_type <| self.internal_do_cast value_type on_problems + ## Change the value type of the column to a more specific one, based on its + contents. + + This operation is currently not available in the Database backend. + auto_value_type : Boolean -> Column + auto_value_type self use_smallest=False = + _ = use_smallest + Error.throw <| Unsupported_Database_Operation.Error "`Column.auto_value_type` is not supported in the Database backends." + ## PRIVATE Shares the core CAST logic between `cast` and `parse`. internal_do_cast : Value_Type -> Problem_Behavior -> Column diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso index c8dc2f54dc27..f779a2dc0a39 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso @@ -1979,6 +1979,15 @@ type Table new_column = column_to_cast.cast value_type on_problems table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update + ## Change the value type of table columns to a more specific one, based on + their contents. + + This operation is currently not available in the Database backend. + auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table + auto_value_types self columns=self.column_names use_smallest=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = + _ = [columns, use_smallest, error_on_missing_columns, on_problems] + Error.throw (Unsupported_Database_Operation.Error "Table.auto_value_types is not supported in the Database backends.") + ## ALIAS drop_missing_rows, dropna GROUP Standard.Base.Selections Remove rows which are all blank or containing blank values. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index d9c38acdf63d..45ff3d455c62 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1762,6 +1762,37 @@ type Column on_problems.attach_problems_before problems <| Column.from_storage self.name new_storage + ## Change the value type of the column to a more specific one, based on its + contents. + + Arguments: + - use_smallest: If `True`, the smallest type that can fit all values will + be used. Defaults to `False`. See the rules below. + + ? Auto Type Selection Rules + + - If a `Mixed` column can be assigned a single type, like `Char` or + `Integer`, that will be used. + - Text columns that are variable length and have no length limit set + will get assigned a max length of 255 if all values fit that size. + - Text columns are not parsed. To do that, use the `parse` method. + - Integer columns will be assigned the smallest size that can fit all + values (down to Integer 16-bit, but not Byte). + - However, if `use_smallest` is set to `True`, then: + - If all text columns have the same length, the type will become + fixed length. + - The maximum size of a variable length column will be set to be + exactly the length of its longest element. + - An integer column may be converted into `Byte` if all values are + small enough. + - `Float` columns are not changed, as it could mean a loss of + accuracy. + auto_value_type : Boolean -> Column + auto_value_type self use_smallest=False = + # TODO use_smallest etc. + new_value_type = self.inferred_precise_value_type + self.cast new_value_type + ## ALIAS transform column Applies `function` to each item in this column and returns the column diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 3b5dd7f1aa9e..f99b9cc73761 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -946,6 +946,9 @@ type Table Arguments: - columns: The selection of columns to cast. - value_type: The `Value_Type` to cast the column to. + - error_on_missing_columns: Specifies if a missing input column should + result in an error regardless of the `on_problems` settings. Defaults + to `True`. - on_problems: Specifies how to handle problems if they occur, reporting them as warnings by default. @@ -996,6 +999,47 @@ type Table new_column = column_to_cast.cast value_type on_problems table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update + ## Change the value type of table columns to a more specific one, based on + their contents. + + This is most useful for `Mixed` type columns and will allow to narrow + down the type if all values in the column fit a more specific type. + + Arguments: + - columns: The selection of columns to convert. + - use_smallest: If `True`, the smallest type that can fit all values will + be used. Defaults to `False`. See the rules below. + - error_on_missing_columns: Specifies if a missing input column should + result in an error regardless of the `on_problems` settings. Defaults + to `True`. + - on_problems: Specifies how to handle problems if they occur, reporting + them as warnings by default. + + ? Auto Type Selection Rules + + - If a `Mixed` column can be assigned a single type, like `Char` or + `Integer`, that will be used. + - Text columns that are variable length and have no length limit set + will get assigned a max length of 255 if all values fit that size. + - Text columns are not parsed. To do that, use the `parse` method. + - Integer columns will be assigned the smallest size that can fit all + values (down to Integer 16-bit, but not Byte). + - However, if `use_smallest` is set to `True`, then: + - If all text columns have the same length, the type will become + fixed length. + - The maximum size of a variable length column will be set to be + exactly the length of its longest element. + - An integer column may be converted into `Byte` if all values are + small enough. + - `Float` columns are not changed, as it could mean a loss of + accuracy. + auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table + auto_value_types self columns=self.column_names use_smallest=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = + selected = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False + selected.fold self table-> column_to_cast-> + new_column = column_to_cast.auto_value_type use_smallest + table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update + ## GROUP Standard.Base.Conversions Splits a column of text into a set of new columns. The original column will be removed from the table. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso index 1ef8d038f15f..2a42c36fb3ab 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso @@ -18,7 +18,7 @@ polyglot java import org.enso.table.data.column.storage.type.IntegerType most_specific_value_type : Any -> Boolean -> Value_Type most_specific_value_type value use_smallest=False = case value of - _ : Float -> Value_Type.Float Bits.Bits_64 + _ : Float -> Value_Type.Float Bits.Bits_64 _ : Boolean -> Value_Type.Boolean _ : Date -> Value_Type.Date _ : Time_Of_Day -> Value_Type.Time From 10c04f1c65210296645900d3f0cf3f9142512fa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 25 Sep 2023 15:11:59 +0200 Subject: [PATCH 02/31] clarifying the rules after further thought and discussion --- .../Table/0.0.0-dev/src/Data/Column.enso | 26 +++++++++---------- .../Table/0.0.0-dev/src/Data/Table.enso | 26 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index 45ff3d455c62..4a3077fdb885 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1773,20 +1773,20 @@ type Column - If a `Mixed` column can be assigned a single type, like `Char` or `Integer`, that will be used. - - Text columns that are variable length and have no length limit set - will get assigned a max length of 255 if all values fit that size. - - Text columns are not parsed. To do that, use the `parse` method. - - Integer columns will be assigned the smallest size that can fit all - values (down to Integer 16-bit, but not Byte). + - Text columns are not parsed. To do that, use the `parse` method. + - If a `Float` column contains only integers, it will be converted to + an Integer column. + - If `use_smallest` is `False` (default), no other transformations are + applied. - However, if `use_smallest` is set to `True`, then: - - If all text columns have the same length, the type will become - fixed length. - - The maximum size of a variable length column will be set to be - exactly the length of its longest element. - - An integer column may be converted into `Byte` if all values are - small enough. - - `Float` columns are not changed, as it could mean a loss of - accuracy. + - Integer columns will be assigned the smallest size that can fit all + values (down to 16-bit integers; converting to the `Byte` type has + to be done manually through `cast`). + - If all elements in a text column have the same length, the type + will become fixed length. + - Otherwise, if a text column is variable length and unbounded, but + all text elements are shorter than 255 characters, the column will + get a max length of 255. Otherwise, the column will stay unbounded. auto_value_type : Boolean -> Column auto_value_type self use_smallest=False = # TODO use_smallest etc. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index f99b9cc73761..4f26f630551d 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -1019,20 +1019,20 @@ type Table - If a `Mixed` column can be assigned a single type, like `Char` or `Integer`, that will be used. - - Text columns that are variable length and have no length limit set - will get assigned a max length of 255 if all values fit that size. - - Text columns are not parsed. To do that, use the `parse` method. - - Integer columns will be assigned the smallest size that can fit all - values (down to Integer 16-bit, but not Byte). + - Text columns are not parsed. To do that, use the `parse` method. + - If a `Float` column contains only integers, it will be converted to + an Integer column. + - If `use_smallest` is `False` (default), no other transformations are + applied. - However, if `use_smallest` is set to `True`, then: - - If all text columns have the same length, the type will become - fixed length. - - The maximum size of a variable length column will be set to be - exactly the length of its longest element. - - An integer column may be converted into `Byte` if all values are - small enough. - - `Float` columns are not changed, as it could mean a loss of - accuracy. + - Integer columns will be assigned the smallest size that can fit all + values (down to 16-bit integers; converting to the `Byte` type has + to be done manually through `cast`). + - If all elements in a text column have the same length, the type + will become fixed length. + - Otherwise, if a text column is variable length and unbounded, but + all text elements are shorter than 255 characters, the column will + get a max length of 255. Otherwise, the column will stay unbounded. auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table auto_value_types self columns=self.column_names use_smallest=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = selected = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False From 7b7f8ae867142af657d02e09724ce3fa7a9be109 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 14:04:06 +0200 Subject: [PATCH 03/31] add a note about counting characters --- .../Table/0.0.0-dev/src/Data/Type/Value_Type.enso | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso index 0a0f63da9c7d..f7f713a2c3ae 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso @@ -95,6 +95,16 @@ type Value_Type ANSI SQL: CHAR, VARCHAR, TEXT, LONGVARCHAR, NCHAR, NVARCHAR, TEXT, CLOB, NCLOB + ! Counting Characters + + Note that different backends may count the text in different ways. + The in-memory backend treats a single grapheme cluster (e.g. 💡) as a + single character unit. In most database systems more complex grapheme + clusters may be counted as multiple characters. So there isn't a 1-1 + correspondence between these limits across backends which may cause + strings to be truncated if they contain such characters and are close + to the limit. + Arguments: - size: the maximum number of characters that can be stored in the column. It can be nothing to indicate no limit. From 38889e4556e9d0d92401f84606eef30b4a49706f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 14:08:54 +0200 Subject: [PATCH 04/31] update spec --- .../lib/Standard/Table/0.0.0-dev/src/Data/Column.enso | 7 ++++--- .../lib/Standard/Table/0.0.0-dev/src/Data/Table.enso | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index 4a3077fdb885..368a25ce8c47 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1784,9 +1784,10 @@ type Column to be done manually through `cast`). - If all elements in a text column have the same length, the type will become fixed length. - - Otherwise, if a text column is variable length and unbounded, but - all text elements are shorter than 255 characters, the column will - get a max length of 255. Otherwise, the column will stay unbounded. + - Otherwise, if a text column is variable length, but all text + elements are no longer than 255 characters, the column will get a + max length of 255. Otherwise, the column size limit will stay + unchanged. auto_value_type : Boolean -> Column auto_value_type self use_smallest=False = # TODO use_smallest etc. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 4f26f630551d..d962c7f889bc 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -1030,9 +1030,10 @@ type Table to be done manually through `cast`). - If all elements in a text column have the same length, the type will become fixed length. - - Otherwise, if a text column is variable length and unbounded, but - all text elements are shorter than 255 characters, the column will - get a max length of 255. Otherwise, the column will stay unbounded. + - Otherwise, if a text column is variable length, but all text + elements are no longer than 255 characters, the column will get a + max length of 255. Otherwise, the column size limit will stay + unchanged. auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table auto_value_types self columns=self.column_names use_smallest=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = selected = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False From 605be6bab6cba9ece07dcc90003f7b11dec901dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 14:11:37 +0200 Subject: [PATCH 05/31] tests --- .../Conversion_Spec.enso | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 7f024f35897c..cdac07954afd 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -531,3 +531,87 @@ spec setup = r3 = t.parse ["X", "Y"] Value_Type.Integer r3.should_fail_with Missing_Input_Columns r3.catch.criteria . should_equal ["Y"] + + if setup.is_database then Test.group "Table/Column auto value type" <| + Test.specify "should report unsupported" <| + t = table_builder [["X", [1, 2, 3]]] + t.auto_value_types . should_fail_with Unsupported_Database_Operation + t.at "X" . auto_value_type . should_fail_with Unsupported_Database_Operation + + if setup.is_database.not then Test.group "Table/Column auto value type" <| + Test.specify "should allow to narrow down types of a Mixed column" <| + mixer = My_Type.Value 1 + t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]], ["mix", [1, mixer, "a"]], ["dates", [mixer, Date.new 2022, Date.new 2020]], ["datetimes", [mixer, Date_Time.new 2022 12 30 13 45, Date_Time.new 2020]], ["times", [mixer, Time_Of_Day.new 12 30, Time_Of_Day.new 13 45]], ["mixed_time", [Date.new 2022, Time_Of_Day.new 12 30, Date_Time.new 2019]], ["bools", [mixer, True, False]]] + t1 = t0.drop 1 + + t1.at "strs" . value_type . should_equal Value_Type.Mixed + t1.at "ints" . value_type . should_equal Value_Type.Mixed + t1.at "floats" . value_type . should_equal Value_Type.Mixed + t1.at "mix" . value_type . should_equal Value_Type.Mixed + t1.at "dates" . value_type . should_equal Value_Type.Mixed + t1.at "datetimes" . value_type . should_equal Value_Type.Mixed + t1.at "times" . value_type . should_equal Value_Type.Mixed + t1.at "mixed_time" . value_type . should_equal Value_Type.Mixed + t1.at "bools" . value_type . should_equal Value_Type.Mixed + + t2 = t1.auto_value_types + t2.at "strs" . value_type . should_equal Value_Type.Char + t2.at "ints" . value_type . should_equal Value_Type.Integer + t2.at "floats" . value_type . should_equal Value_Type.Float + t2.at "mix" . value_type . should_equal Value_Type.Mixed + t2.at "dates" . value_type . should_equal Value_Type.Date + t2.at "datetimes" . value_type . should_equal Value_Type.Date_Time + t2.at "times" . value_type . should_equal Value_Type.Time + t2.at "mixed_time" . value_type . should_equal Value_Type.Mixed + t2.at "bools" . value_type . should_equal Value_Type.Boolean + + Test.specify "will only modify selected columns" <| + mixer = My_Type.Value 1 + t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]]] + t1 = t0.drop 1 + + t2 = t1.auto_value_types [] + t2.at "strs" . value_type . should_equal Value_Type.Mixed + t2.at "ints" . value_type . should_equal Value_Type.Mixed + t2.at "floats" . value_type . should_equal Value_Type.Mixed + + t3 = t1.auto_value_types ["strs"] + t3.at "strs" . value_type . should_equal Value_Type.Char + t3.at "ints" . value_type . should_equal Value_Type.Mixed + t3.at "floats" . value_type . should_equal Value_Type.Mixed + + # should match ints and floats but not strs + t4 = t1.auto_value_types "[if].*".to_regex + t4.at "strs" . value_type . should_equal Value_Type.Mixed + t4.at "ints" . value_type . should_equal Value_Type.Integer + t4.at "floats" . value_type . should_equal Value_Type.Float + + Test.specify "will convert a Float column to Integer if all values can be represented as long" <| + t1 = table_builder [["X", [1.0, 2.0, 3.0]], ["Y", [1.0, 2.5, 3.0]], ["Z", [1.0, 2.0, (2.0^100)]]] + t1.at "X" . value_type . should_equal Value_Type.Float + t1.at "Y" . value_type . should_equal Value_Type.Float + t1.at "Z" . value_type . should_equal Value_Type.Float + + t2 = t1.auto_value_types + t2.at "X" . value_type . should_equal Value_Type.Integer + t2.at "Y" . value_type . should_equal Value_Type.Float + t2.at "Z" . value_type . should_equal Value_Type.Float + + t2.at "X" . to_vector . should_equal [1, 2, 3] + + Test.specify "will not parse text columns" <| + t1 = table_builder [["X", ["1", "2", "3"]]] + c2 = t1.at "X" . auto_value_type + c2.value_type . should_equal Value_Type.Char + + Test.specify "will try to find the smallest integer type to fit the value (if use_smallest=True)" <| + # but won't allow Byte automatically + Nothing + + Test.specify "if all text values have the same length, will change the type to fixed-length string (if use_smallest=True)" <| + # test on limited and unlimited + Nothing + + Test.specify "if all text values fit under 255 characters, will add a 255 length limit (if use_smallest=True)" <| + # test on limited and unlimited + Nothing From fcf623cf1de00617500eb29895ba228692152181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 14:18:03 +0200 Subject: [PATCH 06/31] tests2 --- .../Conversion_Spec.enso | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index cdac07954afd..102c8dd49d6f 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -604,9 +604,35 @@ spec setup = c2 = t1.at "X" . auto_value_type c2.value_type . should_equal Value_Type.Char + Test.specify "will choose Decimal type if all values are integers but cannot fit long" <| + c0 = table_builder [["X", [My_Type.Value 42, 1, 2, 2^100]]] . at "X" + c1 = c0.drop 1 + + c1.value_type . should_equal Value_Type.Mixed + c2 = c1.auto_value_type + c2.value_type . should_be_a (Value_Type.Decimal ...) + c2.to_vector . should_equal [1, 2, 2^100] + Test.specify "will try to find the smallest integer type to fit the value (if use_smallest=True)" <| - # but won't allow Byte automatically - Nothing + [False, True].each is_mixed-> + prefix = if is_mixed then "mixed" else 0 + t0 = table_builder [["X", [prefix, 1, 2, 3]], ["Y", [prefix, 2^20, 2, 3]], ["Z", [prefix, 2^50, 2, 3]]] + t1 = t0.drop 1 + + case is_mixed of + True -> t1.at "Z" . value_type . should_equal Value_Type.Mixed + False -> t1.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + + t2 = t1.auto_value_types use_smallest=False + t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + + t3 = t1.auto_value_types use_smallest=True + # Even though X's values are small enough to fit in a Byte, we stick to 16-bit Integers. + t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + t3.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_32) + t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) Test.specify "if all text values have the same length, will change the type to fixed-length string (if use_smallest=True)" <| # test on limited and unlimited From ade2f32b845324d4b8832fe375915c3ccb59f008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 14:28:54 +0200 Subject: [PATCH 07/31] tests3 --- .../Conversion_Spec.enso | 86 +++++++++++++------ 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 102c8dd49d6f..30b418a69303 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -540,30 +540,31 @@ spec setup = if setup.is_database.not then Test.group "Table/Column auto value type" <| Test.specify "should allow to narrow down types of a Mixed column" <| - mixer = My_Type.Value 1 - t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]], ["mix", [1, mixer, "a"]], ["dates", [mixer, Date.new 2022, Date.new 2020]], ["datetimes", [mixer, Date_Time.new 2022 12 30 13 45, Date_Time.new 2020]], ["times", [mixer, Time_Of_Day.new 12 30, Time_Of_Day.new 13 45]], ["mixed_time", [Date.new 2022, Time_Of_Day.new 12 30, Date_Time.new 2019]], ["bools", [mixer, True, False]]] - t1 = t0.drop 1 - - t1.at "strs" . value_type . should_equal Value_Type.Mixed - t1.at "ints" . value_type . should_equal Value_Type.Mixed - t1.at "floats" . value_type . should_equal Value_Type.Mixed - t1.at "mix" . value_type . should_equal Value_Type.Mixed - t1.at "dates" . value_type . should_equal Value_Type.Mixed - t1.at "datetimes" . value_type . should_equal Value_Type.Mixed - t1.at "times" . value_type . should_equal Value_Type.Mixed - t1.at "mixed_time" . value_type . should_equal Value_Type.Mixed - t1.at "bools" . value_type . should_equal Value_Type.Mixed + [True, False].each use_smallest-> + mixer = My_Type.Value 1 + t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]], ["mix", [1, mixer, "a"]], ["dates", [mixer, Date.new 2022, Date.new 2020]], ["datetimes", [mixer, Date_Time.new 2022 12 30 13 45, Date_Time.new 2020]], ["times", [mixer, Time_Of_Day.new 12 30, Time_Of_Day.new 13 45]], ["mixed_time", [Date.new 2022, Time_Of_Day.new 12 30, Date_Time.new 2019]], ["bools", [mixer, True, False]]] + t1 = t0.drop 1 - t2 = t1.auto_value_types - t2.at "strs" . value_type . should_equal Value_Type.Char - t2.at "ints" . value_type . should_equal Value_Type.Integer - t2.at "floats" . value_type . should_equal Value_Type.Float - t2.at "mix" . value_type . should_equal Value_Type.Mixed - t2.at "dates" . value_type . should_equal Value_Type.Date - t2.at "datetimes" . value_type . should_equal Value_Type.Date_Time - t2.at "times" . value_type . should_equal Value_Type.Time - t2.at "mixed_time" . value_type . should_equal Value_Type.Mixed - t2.at "bools" . value_type . should_equal Value_Type.Boolean + t1.at "strs" . value_type . should_equal Value_Type.Mixed + t1.at "ints" . value_type . should_equal Value_Type.Mixed + t1.at "floats" . value_type . should_equal Value_Type.Mixed + t1.at "mix" . value_type . should_equal Value_Type.Mixed + t1.at "dates" . value_type . should_equal Value_Type.Mixed + t1.at "datetimes" . value_type . should_equal Value_Type.Mixed + t1.at "times" . value_type . should_equal Value_Type.Mixed + t1.at "mixed_time" . value_type . should_equal Value_Type.Mixed + t1.at "bools" . value_type . should_equal Value_Type.Mixed + + t2 = t1.auto_value_types use_smallest=use_smallest + t2.at "strs" . value_type . should_equal Value_Type.Char + t2.at "ints" . value_type . should_equal Value_Type.Integer + t2.at "floats" . value_type . should_equal Value_Type.Float + t2.at "mix" . value_type . should_equal Value_Type.Mixed + t2.at "dates" . value_type . should_equal Value_Type.Date + t2.at "datetimes" . value_type . should_equal Value_Type.Date_Time + t2.at "times" . value_type . should_equal Value_Type.Time + t2.at "mixed_time" . value_type . should_equal Value_Type.Mixed + t2.at "bools" . value_type . should_equal Value_Type.Boolean Test.specify "will only modify selected columns" <| mixer = My_Type.Value 1 @@ -604,6 +605,16 @@ spec setup = c2 = t1.at "X" . auto_value_type c2.value_type . should_equal Value_Type.Char + Test.specify "will 'undo' a cast to Mixed" <| + t1 = table_builder [["X", [1, 2, 3]], ["Y", ["a", "b", "c"]]] + t2 = t1.cast ["X", "Y"] Value_Type.Mixed + t2.at "X" . value_type . should_equal Value_Type.Mixed + t2.at "Y" . value_type . should_equal Value_Type.Mixed + + t3 = t2.auto_value_types + t3.at "X" . value_type . should_equal Value_Type.Integer + t3.at "Y" . value_type . should_equal Value_Type.Char + Test.specify "will choose Decimal type if all values are integers but cannot fit long" <| c0 = table_builder [["X", [My_Type.Value 42, 1, 2, 2^100]]] . at "X" c1 = c0.drop 1 @@ -620,7 +631,7 @@ spec setup = t1 = t0.drop 1 case is_mixed of - True -> t1.at "Z" . value_type . should_equal Value_Type.Mixed + True -> t1.at "Z" . value_type . should_equal Value_Type.Mixed False -> t1.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) t2 = t1.auto_value_types use_smallest=False @@ -635,8 +646,31 @@ spec setup = t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) Test.specify "if all text values have the same length, will change the type to fixed-length string (if use_smallest=True)" <| - # test on limited and unlimited - Nothing + [False, True].each is_mixed-> + prefix = if is_mixed then 42 else "FOOBARBAZ" + c0 = table_builder [["X", [prefix, "aa", "bb", "cc"]]] . at "X" + c1 = c0.drop 1 + c1.to_vector . should_equal ["aa", "bb", "cc"] + + case is_mixed of + True -> c1.value_type . should_equal Value_Type.Mixed + False -> c1.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + + c2 = c1.auto_value_type use_smallest=False + c2.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + + c3 = c1.auto_value_type use_smallest=True + c3.value_type . should_equal (Value_Type.Char size=2 variable_length=False) + + c4 = table_builder [["X", ["a", "x", "y"]]] . at "X" . cast (Value_Type.Char size=100 variable_length=True) + c4.to_vector . should_equal ["a", "x", "y"] + c4.value_type . should_equal (Value_Type.Char size=100 variable_length=True) + + c5 = c4.auto_value_type use_smallest=False + c5.value_type . should_equal (Value_Type.Char size=100 variable_length=True) + + c6 = c4.auto_value_type use_smallest=True + c6.value_type . should_equal (Value_Type.Char size=1 variable_length=False) Test.specify "if all text values fit under 255 characters, will add a 255 length limit (if use_smallest=True)" <| # test on limited and unlimited From 345499bc019b70115dd767708e854e239eff79a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 17:05:01 +0200 Subject: [PATCH 08/31] tests 4 --- .../Conversion_Spec.enso | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 30b418a69303..56ffccf29787 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -532,13 +532,13 @@ spec setup = r3.should_fail_with Missing_Input_Columns r3.catch.criteria . should_equal ["Y"] - if setup.is_database then Test.group "Table/Column auto value type" <| + if setup.is_database then Test.group prefix+"Table/Column auto value type" <| Test.specify "should report unsupported" <| t = table_builder [["X", [1, 2, 3]]] t.auto_value_types . should_fail_with Unsupported_Database_Operation t.at "X" . auto_value_type . should_fail_with Unsupported_Database_Operation - if setup.is_database.not then Test.group "Table/Column auto value type" <| + if setup.is_database.not then Test.group prefix+"Table/Column auto value type" <| Test.specify "should allow to narrow down types of a Mixed column" <| [True, False].each use_smallest-> mixer = My_Type.Value 1 @@ -673,5 +673,26 @@ spec setup = c6.value_type . should_equal (Value_Type.Char size=1 variable_length=False) Test.specify "if all text values fit under 255 characters, will add a 255 length limit (if use_smallest=True)" <| - # test on limited and unlimited - Nothing + t1 = table_builder [["short_unbounded", ["a", "bb", "ccc"]], ["long_unbounded", ["a"*100, "b"*200, "c"*300]]] + + t2 = t1 . set (t1.at "short_unbounded" . cast (Value_Type.Char size=1000)) "short_1000" . set (t1.at "short_unbounded" . cast (Value_Type.Char size=10)) "short_10" . set (t1.at "long_unbounded" . cast (Value_Type.Char size=400)) "long_400" . set (t1.at "short_unbounded" . cast Value_Type.Mixed) "short_mixed" + t2.at "short_mixed" . value_type . should_equal Value_Type.Mixed + + t3 = t2.auto_value_types use_smallest=False + t3.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + t3.at "short_1000" . value_type . should_equal (Value_Type.Char size=1000 variable_length=True) + t3.at "short_10" . value_type . should_equal (Value_Type.Char size=10 variable_length=True) + # Mixed column gets to be text again. + t3.at "short_mixed" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + t3.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + t3.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True) + + t4 = t2.auto_value_types use_smallest=True + # Short ones get shortened to 255 unless they were shorter already. + t4.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=255 variable_length=True) + t4.at "short_1000" . value_type . should_equal (Value_Type.Char size=255 variable_length=True) + t4.at "short_10" . value_type . should_equal (Value_Type.Char size=10 variable_length=True) + t4.at "short_mixed" . value_type . should_equal (Value_Type.Char size=255 variable_length=True) + # Long ones cannot fit in 255 so they are kept as-is. + t4.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + t4.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True) From 5ef6641831e62188fb935a30dc20a13f92c787c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 17:07:55 +0200 Subject: [PATCH 09/31] Rename use_smallest to shrink_types which is more accurate --- .../Database/0.0.0-dev/src/Data/Column.enso | 4 +-- .../Database/0.0.0-dev/src/Data/Table.enso | 4 +-- .../Table/0.0.0-dev/src/Data/Column.enso | 12 ++++----- .../Table/0.0.0-dev/src/Data/Table.enso | 12 ++++----- .../Conversion_Spec.enso | 26 +++++++++---------- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index 13ad41d0105c..7f69a2e98519 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -1584,8 +1584,8 @@ type Column This operation is currently not available in the Database backend. auto_value_type : Boolean -> Column - auto_value_type self use_smallest=False = - _ = use_smallest + auto_value_type self shrink_types=False = + _ = shrink_types Error.throw <| Unsupported_Database_Operation.Error "`Column.auto_value_type` is not supported in the Database backends." ## PRIVATE diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso index f779a2dc0a39..7397614c1d48 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso @@ -1984,8 +1984,8 @@ type Table This operation is currently not available in the Database backend. auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table - auto_value_types self columns=self.column_names use_smallest=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = - _ = [columns, use_smallest, error_on_missing_columns, on_problems] + auto_value_types self columns=self.column_names shrink_types=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = + _ = [columns, shrink_types, error_on_missing_columns, on_problems] Error.throw (Unsupported_Database_Operation.Error "Table.auto_value_types is not supported in the Database backends.") ## ALIAS drop_missing_rows, dropna diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index 368a25ce8c47..92034affea67 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1766,8 +1766,8 @@ type Column contents. Arguments: - - use_smallest: If `True`, the smallest type that can fit all values will - be used. Defaults to `False`. See the rules below. + - shrink_types: If set `True`, smaller types will be chosen if possible, + according to the rules below. Defaults to `False`. ? Auto Type Selection Rules @@ -1776,9 +1776,9 @@ type Column - Text columns are not parsed. To do that, use the `parse` method. - If a `Float` column contains only integers, it will be converted to an Integer column. - - If `use_smallest` is `False` (default), no other transformations are + - If `shrink_types` is `False` (default), no other transformations are applied. - - However, if `use_smallest` is set to `True`, then: + - However, if `shrink_types` is set to `True`, then: - Integer columns will be assigned the smallest size that can fit all values (down to 16-bit integers; converting to the `Byte` type has to be done manually through `cast`). @@ -1789,8 +1789,8 @@ type Column max length of 255. Otherwise, the column size limit will stay unchanged. auto_value_type : Boolean -> Column - auto_value_type self use_smallest=False = - # TODO use_smallest etc. + auto_value_type self shrink_types=False = + # TODO shrink_types etc. new_value_type = self.inferred_precise_value_type self.cast new_value_type diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index d962c7f889bc..17c04d152ab4 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -1007,8 +1007,8 @@ type Table Arguments: - columns: The selection of columns to convert. - - use_smallest: If `True`, the smallest type that can fit all values will - be used. Defaults to `False`. See the rules below. + - shrink_types: If set `True`, smaller types will be chosen if possible, + according to the rules below. Defaults to `False`. - error_on_missing_columns: Specifies if a missing input column should result in an error regardless of the `on_problems` settings. Defaults to `True`. @@ -1022,9 +1022,9 @@ type Table - Text columns are not parsed. To do that, use the `parse` method. - If a `Float` column contains only integers, it will be converted to an Integer column. - - If `use_smallest` is `False` (default), no other transformations are + - If `shrink_types` is `False` (default), no other transformations are applied. - - However, if `use_smallest` is set to `True`, then: + - However, if `shrink_types` is set to `True`, then: - Integer columns will be assigned the smallest size that can fit all values (down to 16-bit integers; converting to the `Byte` type has to be done manually through `cast`). @@ -1035,10 +1035,10 @@ type Table max length of 255. Otherwise, the column size limit will stay unchanged. auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table - auto_value_types self columns=self.column_names use_smallest=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = + auto_value_types self columns=self.column_names shrink_types=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning = selected = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False selected.fold self table-> column_to_cast-> - new_column = column_to_cast.auto_value_type use_smallest + new_column = column_to_cast.auto_value_type shrink_types table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update ## GROUP Standard.Base.Conversions diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 56ffccf29787..29d35ca01b8e 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -540,7 +540,7 @@ spec setup = if setup.is_database.not then Test.group prefix+"Table/Column auto value type" <| Test.specify "should allow to narrow down types of a Mixed column" <| - [True, False].each use_smallest-> + [True, False].each shrink_types-> mixer = My_Type.Value 1 t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]], ["mix", [1, mixer, "a"]], ["dates", [mixer, Date.new 2022, Date.new 2020]], ["datetimes", [mixer, Date_Time.new 2022 12 30 13 45, Date_Time.new 2020]], ["times", [mixer, Time_Of_Day.new 12 30, Time_Of_Day.new 13 45]], ["mixed_time", [Date.new 2022, Time_Of_Day.new 12 30, Date_Time.new 2019]], ["bools", [mixer, True, False]]] t1 = t0.drop 1 @@ -555,7 +555,7 @@ spec setup = t1.at "mixed_time" . value_type . should_equal Value_Type.Mixed t1.at "bools" . value_type . should_equal Value_Type.Mixed - t2 = t1.auto_value_types use_smallest=use_smallest + t2 = t1.auto_value_types shrink_types=shrink_types t2.at "strs" . value_type . should_equal Value_Type.Char t2.at "ints" . value_type . should_equal Value_Type.Integer t2.at "floats" . value_type . should_equal Value_Type.Float @@ -624,7 +624,7 @@ spec setup = c2.value_type . should_be_a (Value_Type.Decimal ...) c2.to_vector . should_equal [1, 2, 2^100] - Test.specify "will try to find the smallest integer type to fit the value (if use_smallest=True)" <| + Test.specify "will try to find the smallest integer type to fit the value (if shrink_types=True)" <| [False, True].each is_mixed-> prefix = if is_mixed then "mixed" else 0 t0 = table_builder [["X", [prefix, 1, 2, 3]], ["Y", [prefix, 2^20, 2, 3]], ["Z", [prefix, 2^50, 2, 3]]] @@ -634,18 +634,18 @@ spec setup = True -> t1.at "Z" . value_type . should_equal Value_Type.Mixed False -> t1.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) - t2 = t1.auto_value_types use_smallest=False + t2 = t1.auto_value_types shrink_types=False t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) t2.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) - t3 = t1.auto_value_types use_smallest=True + t3 = t1.auto_value_types shrink_types=True # Even though X's values are small enough to fit in a Byte, we stick to 16-bit Integers. t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) t3.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_32) t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) - Test.specify "if all text values have the same length, will change the type to fixed-length string (if use_smallest=True)" <| + Test.specify "if all text values have the same length, will change the type to fixed-length string (if shrink_types=True)" <| [False, True].each is_mixed-> prefix = if is_mixed then 42 else "FOOBARBAZ" c0 = table_builder [["X", [prefix, "aa", "bb", "cc"]]] . at "X" @@ -656,29 +656,29 @@ spec setup = True -> c1.value_type . should_equal Value_Type.Mixed False -> c1.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) - c2 = c1.auto_value_type use_smallest=False + c2 = c1.auto_value_type shrink_types=False c2.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) - c3 = c1.auto_value_type use_smallest=True + c3 = c1.auto_value_type shrink_types=True c3.value_type . should_equal (Value_Type.Char size=2 variable_length=False) c4 = table_builder [["X", ["a", "x", "y"]]] . at "X" . cast (Value_Type.Char size=100 variable_length=True) c4.to_vector . should_equal ["a", "x", "y"] c4.value_type . should_equal (Value_Type.Char size=100 variable_length=True) - c5 = c4.auto_value_type use_smallest=False + c5 = c4.auto_value_type shrink_types=False c5.value_type . should_equal (Value_Type.Char size=100 variable_length=True) - c6 = c4.auto_value_type use_smallest=True + c6 = c4.auto_value_type shrink_types=True c6.value_type . should_equal (Value_Type.Char size=1 variable_length=False) - Test.specify "if all text values fit under 255 characters, will add a 255 length limit (if use_smallest=True)" <| + Test.specify "if all text values fit under 255 characters, will add a 255 length limit (if shrink_types=True)" <| t1 = table_builder [["short_unbounded", ["a", "bb", "ccc"]], ["long_unbounded", ["a"*100, "b"*200, "c"*300]]] t2 = t1 . set (t1.at "short_unbounded" . cast (Value_Type.Char size=1000)) "short_1000" . set (t1.at "short_unbounded" . cast (Value_Type.Char size=10)) "short_10" . set (t1.at "long_unbounded" . cast (Value_Type.Char size=400)) "long_400" . set (t1.at "short_unbounded" . cast Value_Type.Mixed) "short_mixed" t2.at "short_mixed" . value_type . should_equal Value_Type.Mixed - t3 = t2.auto_value_types use_smallest=False + t3 = t2.auto_value_types shrink_types=False t3.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) t3.at "short_1000" . value_type . should_equal (Value_Type.Char size=1000 variable_length=True) t3.at "short_10" . value_type . should_equal (Value_Type.Char size=10 variable_length=True) @@ -687,7 +687,7 @@ spec setup = t3.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) t3.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True) - t4 = t2.auto_value_types use_smallest=True + t4 = t2.auto_value_types shrink_types=True # Short ones get shortened to 255 unless they were shorter already. t4.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=255 variable_length=True) t4.at "short_1000" . value_type . should_equal (Value_Type.Char size=255 variable_length=True) From 9d6ec5f0ce8889123398cef41448712ace1404f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 18:39:35 +0200 Subject: [PATCH 10/31] avoid copying when casting String storage, where possible --- .../cast/ToTextStorageConverter.java | 46 +++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/cast/ToTextStorageConverter.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/cast/ToTextStorageConverter.java index b76839d32eb4..df3f49be7f23 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/cast/ToTextStorageConverter.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/cast/ToTextStorageConverter.java @@ -29,8 +29,8 @@ public ToTextStorageConverter(TextType textType) { public Storage cast(Storage storage, CastProblemBuilder problemBuilder) { if (storage instanceof StringStorage stringStorage) { - if (stringStorage.getType().equals(targetType)) { - return stringStorage; + if (canAvoidCopying(stringStorage)) { + return retypeStringStorage(stringStorage); } else { return adaptStringStorage(stringStorage, problemBuilder); } @@ -150,7 +150,8 @@ private Storage castDoubleStorage(DoubleStorage doubleStorage, CastProbl return builder.seal(); } - private Storage castDateTimeStorage(Storage storage, Function converter, CastProblemBuilder problemBuilder) { + private Storage castDateTimeStorage(Storage storage, Function converter, + CastProblemBuilder problemBuilder) { Context context = Context.getCurrent(); StringBuilder builder = new StringBuilder(storage.size(), targetType); for (int i = 0; i < storage.size(); i++) { @@ -204,4 +205,43 @@ private Storage adaptStringStorage(StringStorage stringStorage, CastProb problemBuilder.aggregateOtherProblems(builder.getProblems()); return builder.seal(); } + + private boolean canAvoidCopying(StringStorage stringStorage) { + if (targetType.fitsExactly(stringStorage.getType())) { + return true; + } + + long maxLength = Long.MIN_VALUE; + long minLength = Long.MAX_VALUE; + for (int i = 0; i < stringStorage.size(); i++) { + String value = stringStorage.getItem(i); + if (value == null) { + continue; + } + + long length = value.length(); + if (length > maxLength) { + maxLength = length; + } + if (length < minLength) { + minLength = length; + } + } + + if (targetType.fixedLength()) { + boolean effectivelyFixedLength = minLength == maxLength; + return effectivelyFixedLength && targetType.maxLength() == maxLength; + } else { + return targetType.maxLength() == -1 || maxLength <= targetType.maxLength(); + } + } + + /** + * Creates a new storage re-using the existing array. + *

+ * This can only be done if the values do not need any adaptations, checked by {@code canAvoidCopying}. + */ + private Storage retypeStringStorage(StringStorage stringStorage) { + return new StringStorage(stringStorage.getData(), stringStorage.size(), targetType); + } } From fd5341c6fd121704dd15c5dc6a1077706aafdba8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 18:42:53 +0200 Subject: [PATCH 11/31] implement shrink_types=True for Char and WIP for Float --- .../Table/0.0.0-dev/src/Data/Column.enso | 9 ++-- .../data/column/storage/MixedStorage.java | 13 ++++++ .../column/storage/MixedStorageFacade.java | 5 ++ .../table/data/column/storage/Storage.java | 13 ++++++ .../data/column/storage/StringStorage.java | 46 +++++++++++++++++-- .../column/storage/numeric/DoubleStorage.java | 38 +++++++++++++++ .../data/column/storage/type/StorageType.java | 2 +- .../Conversion_Spec.enso | 5 +- 8 files changed, 122 insertions(+), 9 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index 92034affea67..e453852ce394 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1790,9 +1790,12 @@ type Column unchanged. auto_value_type : Boolean -> Column auto_value_type self shrink_types=False = - # TODO shrink_types etc. - new_value_type = self.inferred_precise_value_type - self.cast new_value_type + new_value_type = case shrink_types of + False -> self.inferred_precise_value_type + True -> + Storage.to_value_type self.java_column.getStorage.inferPreciseTypeShrunk + # We run with Report_Error because we do not expect any problems. + self.cast new_value_type on_problems=Problem_Behavior.Report_Error ## ALIAS transform column diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java index 0736c163c389..c5286c82729d 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java @@ -105,6 +105,19 @@ public StorageType inferPreciseType() { return inferredType; } + @Override + public StorageType inferPreciseTypeShrunk() { + Storage specialized = getInferredStorage(); + if (specialized == null) { + // If no specialized type is available, it means that: + assert inferredType instanceof AnyObjectType; + return AnyObjectType.INSTANCE; + } + + // If we are able to get a more specialized storage for more specific type - we delegate to its own shrinking logic. + return specialized.inferPreciseTypeShrunk(); + } + private Storage getInferredStorage() { if (!hasSpecializedStorageBeenInferred) { StorageType inferredType = inferPreciseType(); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java index 178f04a6e07b..12e438d1eb22 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorageFacade.java @@ -43,6 +43,11 @@ public StorageType inferPreciseType() { return underlyingStorage.inferPreciseType(); } + @Override + public StorageType inferPreciseTypeShrunk() { + return underlyingStorage.inferPreciseTypeShrunk(); + } + @Override public boolean isNa(long idx) { return underlyingStorage.isNa(idx); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java index 98582fb80ea5..b5862940cbd7 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java @@ -39,6 +39,19 @@ public StorageType inferPreciseType() { return getType(); } + /** + * Returns the smallest type (according to Column.auto_value_type rules) that may still fit all values in this + * column. + *

+ * It is a sibling of `inferPreciseType` that allows some further shrinking. It is kept separate, because + * `inferPreciseType` should be quick to compute (cached if needed) as it is used in typechecking of lots of + * operations. This one however, is only used in a specific `auto_value_type` use-case and rarely will need to be + * computed more than once. + */ + public StorageType inferPreciseTypeShrunk() { + return getType(); + } + /** * Returns a more specialized storage, if available. * diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 8f5be3a695d0..57085da1cd3c 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -3,9 +3,9 @@ import org.enso.base.Text_Utils; import org.enso.table.data.column.builder.Builder; import org.enso.table.data.column.builder.StringBuilder; -import org.enso.table.data.column.operation.map.MapOperationStorage; import org.enso.table.data.column.operation.map.BinaryMapOperation; import org.enso.table.data.column.operation.map.MapOperationProblemBuilder; +import org.enso.table.data.column.operation.map.MapOperationStorage; import org.enso.table.data.column.operation.map.UnaryMapOperation; import org.enso.table.data.column.operation.map.text.LikeOp; import org.enso.table.data.column.operation.map.text.StringBooleanOp; @@ -19,10 +19,13 @@ import java.util.BitSet; -/** A column storing strings. */ +/** + * A column storing strings. + */ public final class StringStorage extends SpecializedStorage { private final TextType type; + /** * @param data the underlying data * @param size the number of items stored @@ -111,7 +114,8 @@ public BoolStorage runZip( t.add( new UnaryMapOperation<>(Maps.IS_EMPTY) { @Override - protected BoolStorage runUnaryMap(SpecializedStorage storage, MapOperationProblemBuilder problemBuilder) { + protected BoolStorage runUnaryMap(SpecializedStorage storage, + MapOperationProblemBuilder problemBuilder) { BitSet r = new BitSet(); Context context = Context.getCurrent(); for (int i = 0; i < storage.size; i++) { @@ -162,4 +166,40 @@ protected TextType computeResultType(TextType a, TextType b) { }); return t; } + + @Override + public StorageType inferPreciseTypeShrunk() { + if (type.fixedLength()) { + return type; + } + + long minLength = Long.MAX_VALUE; + long maxLength = Long.MIN_VALUE; + long visitedCounter = 0; + for (int i = 0; i < size(); i++) { + String s = getItem(i); + if (s != null) { + long length = Text_Utils.grapheme_length(s); + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + visitedCounter++; + } + } + + if (visitedCounter == 0) { + return getType(); + } + + final long SHORT_LENGTH_THRESHOLD = 255; + if (minLength == maxLength) { + return TextType.fixedLength(minLength); + } else if (maxLength <= SHORT_LENGTH_THRESHOLD && (type.maxLength() < 0 || SHORT_LENGTH_THRESHOLD < type.maxLength())) { + // If the string was unbounded or the bound was larger than 255, we shrink it to 255. + return TextType.variableLengthWithLimit(SHORT_LENGTH_THRESHOLD); + } else { + // Otherwise, we return the original type (because it was either smaller than the proposed 255 bound, or the + // existing elements to do not fit into the 255 bound). + return getType(); + } + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 564845a8413e..06c091d0d0c9 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -26,6 +26,7 @@ import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.type.FloatType; +import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.index.Index; import org.enso.table.data.mask.OrderMask; @@ -388,4 +389,41 @@ public Storage slice(List ranges) { return new DoubleStorage(newData, newSize, newMissing); } + + private StorageType inferredType = null; + + @Override + public StorageType inferPreciseType() { + if (inferredType == null) { + boolean areAllIntegers = true; + for (int i = 0; i < size; i++) { + if (isMissing.get(i)) { + continue; + } + + double value = Double.longBitsToDouble(data[i]); + if (value % 1.0 != 0.0 && IntegerType.INT_64.fits(value)) { + continue; + } else { + areAllIntegers = false; + break; + } + } + + inferredType = areAllIntegers ? IntegerType.INT_64 : getType(); + } + + return inferredType; + } + + @Override + public StorageType inferPreciseTypeShrunk() { + StorageType inferred = inferPreciseType(); + if (inferred instanceof IntegerType) { + // TODO delegate to infer smallest integer type to fit + return inferred; + } else { + return inferred; + } + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java index b61b6738289c..7ae02626bd36 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java @@ -32,7 +32,7 @@ static StorageType forBoxedItem(Object item) { case LocalTime t -> TimeOfDayType.INSTANCE; case LocalDateTime d -> DateTimeType.INSTANCE; case ZonedDateTime d -> DateTimeType.INSTANCE; - default -> null; + default -> AnyObjectType.INSTANCE; }; } } diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 29d35ca01b8e..0a4d651cf0cc 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -556,8 +556,9 @@ spec setup = t1.at "bools" . value_type . should_equal Value_Type.Mixed t2 = t1.auto_value_types shrink_types=shrink_types - t2.at "strs" . value_type . should_equal Value_Type.Char - t2.at "ints" . value_type . should_equal Value_Type.Integer + # Depending on shrink_types value the size of the Char/Integer types may vary - exact details tested elsewhere. + t2.at "strs" . value_type . should_be_a (Value_Type.Char ...) + t2.at "ints" . value_type . should_be_a (Value_Type.Integer ...) t2.at "floats" . value_type . should_equal Value_Type.Float t2.at "mix" . value_type . should_equal Value_Type.Mixed t2.at "dates" . value_type . should_equal Value_Type.Date From 62f949f11b6bf272665127332fdbf0abcb36eae2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 18:46:38 +0200 Subject: [PATCH 12/31] fix typo: != vs == --- .../enso/table/data/column/storage/numeric/DoubleStorage.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 06c091d0d0c9..95e5e241bdc4 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -402,7 +402,8 @@ public StorageType inferPreciseType() { } double value = Double.longBitsToDouble(data[i]); - if (value % 1.0 != 0.0 && IntegerType.INT_64.fits(value)) { + boolean isWholeNumber = value % 1.0 == 0.0; + if (isWholeNumber && IntegerType.INT_64.fits(value)) { continue; } else { areAllIntegers = false; From 8aa7e88b88fb76ccb6f01e93f487cb4f1424d07c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 18:52:24 +0200 Subject: [PATCH 13/31] slight improvement to tests --- .../Conversion_Spec.enso | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 0a4d651cf0cc..a49f2ec0fd5a 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -594,13 +594,18 @@ spec setup = t1.at "Y" . value_type . should_equal Value_Type.Float t1.at "Z" . value_type . should_equal Value_Type.Float - t2 = t1.auto_value_types - t2.at "X" . value_type . should_equal Value_Type.Integer + t2 = t1.auto_value_types shrink_types=False + t2.at "X" . to_vector . should_equal [1, 2, 3] + t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) t2.at "Y" . value_type . should_equal Value_Type.Float + ## Technically, Z could get converted to Decimal type. But IMO that + is not desirable - at this scale the Float is no longer a + precise type (as not even consecutive integers are exactly + representable). And Decimal is expected to be precise. So such a + conversion should only happen by explicit request, not + automatically. t2.at "Z" . value_type . should_equal Value_Type.Float - t2.at "X" . to_vector . should_equal [1, 2, 3] - Test.specify "will not parse text columns" <| t1 = table_builder [["X", ["1", "2", "3"]]] c2 = t1.at "X" . auto_value_type @@ -628,23 +633,30 @@ spec setup = Test.specify "will try to find the smallest integer type to fit the value (if shrink_types=True)" <| [False, True].each is_mixed-> prefix = if is_mixed then "mixed" else 0 - t0 = table_builder [["X", [prefix, 1, 2, 3]], ["Y", [prefix, 2^20, 2, 3]], ["Z", [prefix, 2^50, 2, 3]]] + t0 = table_builder [["X", [prefix, 1, 2, 3]], ["Y", [prefix, 2^20, 2, 3]], ["Z", [prefix, 2^50, 2, 3]], ["F", [prefix, 1.0, 2.0, 3.0]]] t1 = t0.drop 1 case is_mixed of True -> t1.at "Z" . value_type . should_equal Value_Type.Mixed False -> t1.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + case is_mixed of + True -> t1.at "F" . value_type . should_equal Value_Type.Mixed + False -> t1.at "F" . value_type . should_equal Value_Type.Float + t2 = t1.auto_value_types shrink_types=False t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) t2.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "F" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) t3 = t1.auto_value_types shrink_types=True # Even though X's values are small enough to fit in a Byte, we stick to 16-bit Integers. t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) t3.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_32) t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + # Shrinking Floats also finds the smallest type that fits. + t3.at "F" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) Test.specify "if all text values have the same length, will change the type to fixed-length string (if shrink_types=True)" <| [False, True].each is_mixed-> From a69f41c9b8d56580f8558e8aed6c4d31f7dac7ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 19:09:14 +0200 Subject: [PATCH 14/31] testing more edge cases --- .../Conversion_Spec.enso | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index a49f2ec0fd5a..71093c042e98 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -658,6 +658,14 @@ spec setup = # Shrinking Floats also finds the smallest type that fits. t3.at "F" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + Test.specify "will not return Byte columns by default, but should leave existing Byte columns intact" <| + c1 = table_builder [["X", [1, 2, 3]]] . at "X" . cast Value_Type.Byte + c1.value_type . should_equal Value_Type.Byte + + [True, False].each shrink_types-> + c2 = c1.auto_value_type shrink_types=shrink_types + c2.value_type . should_equal Value_Type.Byte + Test.specify "if all text values have the same length, will change the type to fixed-length string (if shrink_types=True)" <| [False, True].each is_mixed-> prefix = if is_mixed then 42 else "FOOBARBAZ" @@ -709,3 +717,26 @@ spec setup = # Long ones cannot fit in 255 so they are kept as-is. t4.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) t4.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True) + + Test.specify "can deal with all-null columns" <| + t0 = table_builder [["mix", [My_Type.Value 1, Nothing, Nothing]], ["int", [42, Nothing, Nothing]], ["str", ["a", Nothing, Nothing]], ["float", [1.5, Nothing, Nothing]]] + t1 = t0.drop 1 + + t1.at "mix" . value_type . should_equal Value_Type.Mixed + t1.at "int" . value_type . should_equal Value_Type.Integer + t1.at "float" . value_type . should_equal Value_Type.Float + t1.at "str" . value_type . should_equal Value_Type.Char + + t2 = t1.auto_value_types shrink_types=False + t2.at "mix" . value_type . should_equal Value_Type.Mixed + t2.at "int" . value_type . should_equal Value_Type.Integer + t2.at "float" . value_type . should_equal Value_Type.Float + t2.at "str" . value_type . should_equal Value_Type.Char + + t3 = t1.auto_value_types shrink_types=True + t3.at "mix" . value_type . should_equal Value_Type.Mixed + # Technically if there are no elements, then they can be fit inside of the smallest types available: + t3.at "int" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + t3.at "float" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + # But for Text we make an exception and keep the type unbounded: 0-length fixed length string simply would not make any sense. + t3.at "str" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) From a7dddf1175128cf2f94a24cc8b4f1a1aed313e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 19:11:54 +0200 Subject: [PATCH 15/31] WIP: shrinking Int --- .../storage/numeric/AbstractLongStorage.java | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java index 356b839acea5..cb25574b4836 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java @@ -1,6 +1,5 @@ package org.enso.table.data.column.storage.numeric; -import java.util.BitSet; import org.enso.table.data.column.builder.Builder; import org.enso.table.data.column.builder.NumericBuilder; import org.enso.table.data.column.operation.map.MapOperationProblemBuilder; @@ -23,6 +22,10 @@ import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.type.IntegerType; +import org.enso.table.data.column.storage.type.StorageType; +import org.graalvm.polyglot.Context; + +import java.util.BitSet; public abstract class AbstractLongStorage extends NumericStorage { public abstract long getItem(int idx); @@ -77,6 +80,48 @@ public Builder createDefaultBuilderOfSameType(int capacity) { @Override public abstract IntegerType getType(); + @Override + public StorageType inferPreciseType() { + return getType(); + } + + @Override + public StorageType inferPreciseTypeShrunk() { + // If the type is already smallest possible, we return it unchanged (we will return 8-bit columns as-is, although + // we will not shrink 16-bit columns to 8-bits even if it were possible). + if (getType().bits().toInteger() <= 16) { + return getType(); + } + + IntegerType[] possibleTypes = new IntegerType[] { + IntegerType.INT_16, + IntegerType.INT_32, + IntegerType.INT_64 + }; + + int currentTypeIdx = 0; + int n = size(); + Context context = Context.getCurrent(); + for (int i = 0; i < n; i++) { + if (isNa(i)) { + continue; + } + + long item = getItem(i); + while (!possibleTypes[currentTypeIdx].fits(item)) { + currentTypeIdx++; + } + + if (currentTypeIdx >= possibleTypes.length - 1) { + break; + } + + context.safepoint(); + } + + return possibleTypes[currentTypeIdx]; + } + private static MapOperationStorage buildOps() { MapOperationStorage ops = new MapOperationStorage<>(); ops.add(new AddOp<>()) From e4be9786b0bb71138b3aa359ed0868f9befb4c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 19:15:34 +0200 Subject: [PATCH 16/31] add a test for Decimal --- .../Conversion_Spec.enso | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 71093c042e98..8e4fad1f5f7d 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -666,6 +666,25 @@ spec setup = c2 = c1.auto_value_type shrink_types=shrink_types c2.value_type . should_equal Value_Type.Byte + Test.specify "Decimal (scale=0, i.e. integer) columns should also be shrinked if possible and shrink_types=True" <| + t0 = table_builder [["X", [2^100, 1, 2, 3]], ["Y", [10, 20, 2^100, 30]], ["Z", [1, 2, 3, 4]]] . cast "Z" (Value_Type.Decimal scale=0) + t1 = t0.drop 1 + + t1.at "X" . value_type . should_equal (Value_Type.Decimal scale=0) + t1.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0) + t1.at "Z" . value_type . should_equal (Value_Type.Decimal scale=0) + + t2 = t1.auto_value_types shrink_types=False + # Without shrinking we get an integer type, but not the smallest one - just the default 64-bit. + t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + t2.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0) + t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) + + t3 = t1.auto_value_types shrink_types=True + t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + t3.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0) + t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + Test.specify "if all text values have the same length, will change the type to fixed-length string (if shrink_types=True)" <| [False, True].each is_mixed-> prefix = if is_mixed then 42 else "FOOBARBAZ" From 741a5c00b8b9800b50689fe9abe50ded635c5ec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 19:16:32 +0200 Subject: [PATCH 17/31] update spec for Decimal --- distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso | 2 ++ distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso | 2 ++ 2 files changed, 4 insertions(+) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso index e453852ce394..38641c25c1ae 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso @@ -1776,6 +1776,8 @@ type Column - Text columns are not parsed. To do that, use the `parse` method. - If a `Float` column contains only integers, it will be converted to an Integer column. + - If a `Decimal` column contains only integers that could fit in a + 64-bit integer storage, it will be converted to an Integer column. - If `shrink_types` is `False` (default), no other transformations are applied. - However, if `shrink_types` is set to `True`, then: diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 17c04d152ab4..ae8a6eb4b8c8 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -1022,6 +1022,8 @@ type Table - Text columns are not parsed. To do that, use the `parse` method. - If a `Float` column contains only integers, it will be converted to an Integer column. + - If a `Decimal` column contains only integers that could fit in a + 64-bit integer storage, it will be converted to an Integer column. - If `shrink_types` is `False` (default), no other transformations are applied. - However, if `shrink_types` is set to `True`, then: From 556a5e8c14baeaf5c0eb01e02e37c4ab64cfa52c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Sep 2023 23:57:00 +0200 Subject: [PATCH 18/31] fix --- .../org/enso/table/data/column/builder/BigIntegerBuilder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java index abaf10902432..9602800a8c19 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java @@ -88,7 +88,7 @@ public void appendRawNoGrow(BigInteger value) { } public static BigIntegerBuilder retypeFromLongBuilder(LongBuilder longBuilder) { - int n = longBuilder.currentSize; + int n = longBuilder.data.length; BigIntegerBuilder res = new BigIntegerBuilder(n); for (int i = 0; i < n; i++) { res.appendNoGrow(BigInteger.valueOf(longBuilder.data[i])); From a41f3673ddfe56707810ae2e26cdc081b33a1e59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 00:11:03 +0200 Subject: [PATCH 19/31] fix 2 --- .../enso/table/data/column/builder/BigIntegerBuilder.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java index 9602800a8c19..7e4bfc7e072e 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/BigIntegerBuilder.java @@ -10,6 +10,7 @@ import org.enso.table.data.column.storage.type.FloatType; import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.error.ValueTypeMismatchException; +import org.graalvm.polyglot.Context; // For now the BigInteger builder is just a stub, reusing the ObjectBuilder and adding a warning. public class BigIntegerBuilder extends TypedBuilderImpl { @@ -88,10 +89,12 @@ public void appendRawNoGrow(BigInteger value) { } public static BigIntegerBuilder retypeFromLongBuilder(LongBuilder longBuilder) { - int n = longBuilder.data.length; - BigIntegerBuilder res = new BigIntegerBuilder(n); + BigIntegerBuilder res = new BigIntegerBuilder(longBuilder.data.length); + int n = longBuilder.currentSize; + Context context = Context.getCurrent(); for (int i = 0; i < n; i++) { res.appendNoGrow(BigInteger.valueOf(longBuilder.data[i])); + context.safepoint(); } return res; } From 24cc278c222b185ed7c73332a5196e249234d6b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 00:11:23 +0200 Subject: [PATCH 20/31] reduce nesting of checks, make the error message more helpful --- .../Table/0.0.0-dev/src/Data/Table.enso | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index ae8a6eb4b8c8..31ce6bfb36f3 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -88,10 +88,20 @@ type Table Column.from_vector (v.at 0) (v.at 1) . java_column Column.Value java_col -> java_col _ -> invalid_input_shape - if cols.is_empty then Error.throw (Illegal_Argument.Error "Cannot create a table with no columns.") else - if (cols.all c-> c.getSize == cols.first.getSize).not then Error.throw (Illegal_Argument.Error "All columns must have the same row count.") else - if cols.distinct .getName . length != cols.length then Error.throw (Illegal_Argument.Error "Column names must be distinct.") else - Table.Value (Java_Table.new cols) + Panic.recover Illegal_Argument <| + if cols.is_empty then + Panic.throw (Illegal_Argument.Error "Cannot create a table with no columns.") + + if cols.distinct .getName . length != cols.length then + Panic.throw (Illegal_Argument.Error "Column names must be distinct.") + + mismatched_size_column = cols.find if_missing=Nothing c-> + c.getSize != cols.first.getSize + if mismatched_size_column.is_nothing.not then + msg = "All columns must have the same row count, but the column [" + mismatched_size_column.getName + "] has " + mismatched_size_column.getSize.to_text + " rows, while the column [" + cols.first.getName + "] has " + cols.first.getSize.to_text + " rows." + Panic.throw (Illegal_Argument.Error msg) + + Table.Value (Java_Table.new cols) ## GROUP Standard.Base.Constants Creates a new table from a vector of column names and a vector of vectors From d3212902ecf8af74bc597d9a803ea2b892d1a2db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 00:19:52 +0200 Subject: [PATCH 21/31] fix 3 --- .../table/data/column/storage/numeric/BigIntegerStorage.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java index ad7e10ad0a59..668e3bc146f8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java @@ -59,7 +59,7 @@ protected SpecializedStorage newInstance(BigInteger[] data, int size @Override protected BigInteger[] newUnderlyingArray(int size) { - return new BigInteger[0]; + return new BigInteger[size]; } @Override From aabbf47deca44edf1a232c49a1da25c606315315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 11:24:18 +0200 Subject: [PATCH 22/31] change behaviour on all-null Float --- .../table/data/column/storage/numeric/DoubleStorage.java | 5 ++++- .../src/Common_Table_Operations/Conversion_Spec.enso | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 95e5e241bdc4..b25fae072bfd 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -396,12 +396,14 @@ public Storage slice(List ranges) { public StorageType inferPreciseType() { if (inferredType == null) { boolean areAllIntegers = true; + int visitedNumbers = 0; for (int i = 0; i < size; i++) { if (isMissing.get(i)) { continue; } double value = Double.longBitsToDouble(data[i]); + visitedNumbers++; boolean isWholeNumber = value % 1.0 == 0.0; if (isWholeNumber && IntegerType.INT_64.fits(value)) { continue; @@ -411,7 +413,8 @@ public StorageType inferPreciseType() { } } - inferredType = areAllIntegers ? IntegerType.INT_64 : getType(); + // We only switch to integers if there was at least one number. + inferredType = (areAllIntegers && visitedNumbers > 0) ? IntegerType.INT_64 : getType(); } return inferredType; diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 8e4fad1f5f7d..bedb1e2f9e3d 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -749,13 +749,17 @@ spec setup = t2 = t1.auto_value_types shrink_types=False t2.at "mix" . value_type . should_equal Value_Type.Mixed t2.at "int" . value_type . should_equal Value_Type.Integer + ## Technically, if there are no elements, "all of elements" are + whole integers (quantification over empty domain is trivially true). + However, that would be rather not useful, so instead we keep the + original type. t2.at "float" . value_type . should_equal Value_Type.Float t2.at "str" . value_type . should_equal Value_Type.Char t3 = t1.auto_value_types shrink_types=True t3.at "mix" . value_type . should_equal Value_Type.Mixed - # Technically if there are no elements, then they can be fit inside of the smallest types available: + # Technically, if there are no elements, then they can be fit inside of the smallest types available: t3.at "int" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) - t3.at "float" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) + t3.at "float" . value_type . should_equal Value_Type.Float # But for Text we make an exception and keep the type unbounded: 0-length fixed length string simply would not make any sense. t3.at "str" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) From 519cc00232b8441bcd25c87275624286af9cf75c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 11:25:08 +0200 Subject: [PATCH 23/31] simplify --- .../table/data/column/storage/numeric/DoubleStorage.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index b25fae072bfd..27b5c459ce32 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -405,9 +405,8 @@ public StorageType inferPreciseType() { double value = Double.longBitsToDouble(data[i]); visitedNumbers++; boolean isWholeNumber = value % 1.0 == 0.0; - if (isWholeNumber && IntegerType.INT_64.fits(value)) { - continue; - } else { + boolean canBeInteger = isWholeNumber && IntegerType.INT_64.fits(value); + if (!canBeInteger) { areAllIntegers = false; break; } From e1eac4032b41aaf0e227c320d01c70a9d138ac71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 12:12:09 +0200 Subject: [PATCH 24/31] add an edge case --- .../Conversion_Spec.enso | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index bedb1e2f9e3d..369c5085950e 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -675,7 +675,9 @@ spec setup = t1.at "Z" . value_type . should_equal (Value_Type.Decimal scale=0) t2 = t1.auto_value_types shrink_types=False + # Without shrinking we get an integer type, but not the smallest one - just the default 64-bit. + t2.at "X" . to_vector . should_equal [1, 2, 3] t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) t2.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0) t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64) @@ -712,6 +714,20 @@ spec setup = c6 = c4.auto_value_type shrink_types=True c6.value_type . should_equal (Value_Type.Char size=1 variable_length=False) + Test.specify "if all text values are empty string, the type will remain unchanged" <| + c1 = table_builder [["X", ["", ""]]] . at "X" + c2 = c1.cast (Value_Type.Char size=100 variable_length=True) + + c1.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + c2.value_type . should_equal (Value_Type.Char size=100 variable_length=True) + + [True, False].each shrink_types-> + c1_b = c1.auto_value_type shrink_types=shrink_types + c1_b.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) + + c2_b = c2.auto_value_type shrink_types=shrink_types + c2_b.value_type . should_equal (Value_Type.Char size=100 variable_length=True) + Test.specify "if all text values fit under 255 characters, will add a 255 length limit (if shrink_types=True)" <| t1 = table_builder [["short_unbounded", ["a", "bb", "ccc"]], ["long_unbounded", ["a"*100, "b"*200, "c"*300]]] @@ -738,13 +754,14 @@ spec setup = t4.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True) Test.specify "can deal with all-null columns" <| - t0 = table_builder [["mix", [My_Type.Value 1, Nothing, Nothing]], ["int", [42, Nothing, Nothing]], ["str", ["a", Nothing, Nothing]], ["float", [1.5, Nothing, Nothing]]] + t0 = table_builder [["mix", [My_Type.Value 1, Nothing, Nothing]], ["int", [42, Nothing, Nothing]], ["str", ["a", Nothing, Nothing]], ["float", [1.5, Nothing, Nothing]], ["decimal", [2^100, 2^10, 2]]] t1 = t0.drop 1 t1.at "mix" . value_type . should_equal Value_Type.Mixed t1.at "int" . value_type . should_equal Value_Type.Integer t1.at "float" . value_type . should_equal Value_Type.Float t1.at "str" . value_type . should_equal Value_Type.Char + t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0) t2 = t1.auto_value_types shrink_types=False t2.at "mix" . value_type . should_equal Value_Type.Mixed @@ -754,6 +771,7 @@ spec setup = However, that would be rather not useful, so instead we keep the original type. t2.at "float" . value_type . should_equal Value_Type.Float + t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0) t2.at "str" . value_type . should_equal Value_Type.Char t3 = t1.auto_value_types shrink_types=True @@ -761,5 +779,6 @@ spec setup = # Technically, if there are no elements, then they can be fit inside of the smallest types available: t3.at "int" . value_type . should_equal (Value_Type.Integer Bits.Bits_16) t3.at "float" . value_type . should_equal Value_Type.Float + t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0) # But for Text we make an exception and keep the type unbounded: 0-length fixed length string simply would not make any sense. t3.at "str" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True) From 533bf7a219b363738402003cabfa95e1655ac061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 12:16:56 +0200 Subject: [PATCH 25/31] shrinking Decimal pt1 --- .../storage/numeric/BigIntegerStorage.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java index 668e3bc146f8..f2dbd45c12c8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java @@ -20,6 +20,7 @@ import org.enso.table.data.column.storage.ObjectStorage; import org.enso.table.data.column.storage.SpecializedStorage; import org.enso.table.data.column.storage.type.BigIntegerType; +import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.column.storage.type.StorageType; public class BigIntegerStorage extends SpecializedStorage { @@ -96,4 +97,43 @@ public long getMaxPrecisionStored() { return cachedMaxPrecisionStored; } + + private StorageType inferredType = null; + + @Override + public StorageType inferPreciseType() { + if (inferredType == null) { + boolean allFitInLong = true; + int visitedCount = 0; + + for (int i = 0; i < size; i++) { + BigInteger value = data[i]; + if (value == null) { + continue; + } + + visitedCount++; + boolean fitsInLong = IntegerType.INT_64.fits(value); + if (!fitsInLong) { + allFitInLong = false; + break; + } + } + + inferredType = (allFitInLong && visitedCount > 0) ? IntegerType.INT_64 : BigIntegerType.INSTANCE; + } + + return inferredType; + } + + @Override + public StorageType inferPreciseTypeShrunk() { + StorageType preciseType = inferPreciseType(); + if (preciseType instanceof IntegerType) { + // TODO Try shrinking + return preciseType; + } + + return preciseType; + } } From 0f15f744f42dab0b679930fa64c6729e12b64c13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 12:19:23 +0200 Subject: [PATCH 26/31] special case for all "" --- .../org/enso/table/data/column/storage/StringStorage.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 57085da1cd3c..e3e3755137fa 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -175,18 +175,18 @@ public StorageType inferPreciseTypeShrunk() { long minLength = Long.MAX_VALUE; long maxLength = Long.MIN_VALUE; - long visitedCounter = 0; for (int i = 0; i < size(); i++) { String s = getItem(i); if (s != null) { long length = Text_Utils.grapheme_length(s); minLength = Math.min(minLength, length); maxLength = Math.max(maxLength, length); - visitedCounter++; } } - if (visitedCounter == 0) { + // maxLength will be <0 if all values were null and will be ==0 if all values were empty strings. + // In both of these cases, we avoid shrinking the type and return the original type instead. + if (maxLength <= 0) { return getType(); } From 735494af9238d6e51cea80413e5475e6ff5c3fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 13:02:20 +0200 Subject: [PATCH 27/31] shrinking for Decimal and Float --- .../storage/numeric/BigIntegerStorage.java | 26 ++- .../numeric/ComputedNullableLongStorage.java | 197 ++++++++++++++++++ .../column/storage/numeric/DoubleStorage.java | 26 ++- 3 files changed, 245 insertions(+), 4 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java index f2dbd45c12c8..76626b54126a 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java @@ -130,10 +130,32 @@ public StorageType inferPreciseType() { public StorageType inferPreciseTypeShrunk() { StorageType preciseType = inferPreciseType(); if (preciseType instanceof IntegerType) { - // TODO Try shrinking - return preciseType; + return findSmallestIntegerTypeThatFits(); } return preciseType; } + + private StorageType findSmallestIntegerTypeThatFits() { + // This method assumes that all values _do_ fit in some integer type. + assert inferredType instanceof IntegerType; + + final BigIntegerStorage parent = this; + + // We create a Long storage that gets values by converting our storage. + ComputedNullableLongStorage longAdapter = new ComputedNullableLongStorage(size) { + @Override + protected Long computeItem(int idx) { + BigInteger bigInteger = parent.getItem(idx); + if (bigInteger == null) { + return null; + } + + return bigInteger.longValueExact(); + } + }; + + // And rely on its shrinking logic. + return longAdapter.inferPreciseTypeShrunk(); + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java new file mode 100644 index 000000000000..731bf2eaf669 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java @@ -0,0 +1,197 @@ +package org.enso.table.data.column.storage.numeric; + +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.column.storage.type.IntegerType; +import org.enso.table.data.index.Index; +import org.enso.table.data.mask.OrderMask; +import org.enso.table.data.mask.SliceRange; +import org.graalvm.polyglot.Context; + +import java.util.BitSet; +import java.util.List; + +/** + * Implements a storage that computes the ith stored value using some function. + * + *

This storage allows for missing values. Prefer {@link ComputedLongStorage} for non-nullable case. + */ +public abstract class ComputedNullableLongStorage extends AbstractLongStorage { + protected final int size; + + protected abstract Long computeItem(int idx); + + protected ComputedNullableLongStorage(int size) { + this.size = size; + } + + @Override + public int size() { + return size; + } + + @Override + public int countMissing() { + return 0; + } + + @Override + public IntegerType getType() { + return IntegerType.INT_64; + } + + @Override + public boolean isNa(long idx) { + if (idx < 0 || idx >= size) { + throw new IndexOutOfBoundsException( + "Index " + idx + " is out of bounds for range of length " + size + "."); + } + + return computeItem((int) idx) == null; + } + + @Override + public Long getItemBoxed(int idx) { + if (idx < 0 || idx >= size) { + throw new IndexOutOfBoundsException( + "Index " + idx + " is out of bounds for range of length " + size + "."); + } + + return computeItem(idx); + } + + public long getItem(int idx) { + return getItemBoxed(idx); + } + + @Override + public BitSet getIsMissing() { + BitSet missing = new BitSet(); + Context context = Context.getCurrent(); + for (int i = 0; i < size; i++) { + if (computeItem(i) == null) { + missing.set(i); + } + + context.safepoint(); + } + return missing; + } + + @Override + public Storage mask(BitSet mask, int cardinality) { + BitSet newMissing = new BitSet(); + long[] newData = new long[cardinality]; + int resIx = 0; + Context context = Context.getCurrent(); + for (int i = 0; i < size; i++) { + if (mask.get(i)) { + Long item = computeItem(i); + if (item == null) { + newMissing.set(resIx++); + } else { + newData[resIx++] = item; + } + } + + context.safepoint(); + } + return new LongStorage(newData, cardinality, newMissing, getType()); + } + + @Override + public Storage applyMask(OrderMask mask) { + int[] positions = mask.getPositions(); + long[] newData = new long[positions.length]; + BitSet newMissing = new BitSet(); + Context context = Context.getCurrent(); + for (int i = 0; i < positions.length; i++) { + if (positions[i] == Index.NOT_FOUND) { + newMissing.set(i); + } else { + Long item = computeItem(positions[i]); + if (item == null) { + newMissing.set(i); + } else { + newData[i] = item; + } + } + + context.safepoint(); + } + return new LongStorage(newData, positions.length, newMissing, getType()); + } + + @Override + public Storage countMask(int[] counts, int total) { + long[] newData = new long[total]; + BitSet newMissing = new BitSet(); + int pos = 0; + Context context = Context.getCurrent(); + for (int i = 0; i < counts.length; i++) { + Long item = computeItem(i); + if (item == null) { + newMissing.set(pos, pos + counts[i]); + pos += counts[i]; + } else { + long nonNullItem = item; + for (int j = 0; j < counts[i]; j++) { + newData[pos++] = nonNullItem; + } + } + + context.safepoint(); + } + return new LongStorage(newData, total, newMissing, getType()); + } + + @Override + public Storage slice(int offset, int limit) { + int newSize = Math.min(size - offset, limit); + long[] newData = new long[newSize]; + BitSet newMissing = new BitSet(); + Context context = Context.getCurrent(); + for (int i = 0; i < newSize; i++) { + Long item = computeItem(offset + i); + if (item == null) { + newMissing.set(i); + } else { + newData[i] = item; + } + context.safepoint(); + } + return new LongStorage(newData, newSize, newMissing, getType()); + } + + @Override + public Storage slice(List ranges) { + int newSize = SliceRange.totalLength(ranges); + long[] newData = new long[newSize]; + BitSet newMissing = new BitSet(newSize); + int offset = 0; + Context context = Context.getCurrent(); + for (SliceRange range : ranges) { + int rangeStart = range.start(); + int length = range.end() - rangeStart; + for (int i = 0; i < length; i++) { + Long item = computeItem(rangeStart + i); + if (item == null) { + newMissing.set(offset + i); + } else { + newData[offset + i] = item; + } + context.safepoint(); + } + offset += length; + } + + return new LongStorage(newData, newSize, newMissing, getType()); + } + + @Override + public AbstractLongStorage widen(IntegerType widerType) { + // Currently the implementation only reports 64-bit type so there is no widening to do - we can + // just return self. + assert getType().equals(IntegerType.INT_64); + return this; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 27b5c459ce32..5d3aba45996e 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -423,10 +423,32 @@ public StorageType inferPreciseType() { public StorageType inferPreciseTypeShrunk() { StorageType inferred = inferPreciseType(); if (inferred instanceof IntegerType) { - // TODO delegate to infer smallest integer type to fit - return inferred; + return findSmallestIntegerTypeThatFits(); } else { return inferred; } } + + private StorageType findSmallestIntegerTypeThatFits() { + assert inferredType instanceof IntegerType; + + final DoubleStorage parent = this; + + // We create a Long storage that gets values by converting our storage. + ComputedNullableLongStorage longAdapter = new ComputedNullableLongStorage(size) { + @Override + protected Long computeItem(int idx) { + if (parent.isNa(idx)) { + return null; + } + + double value = parent.getItem(idx); + assert value % 1.0 == 0.0 : "The value " + value + " should be a whole number (guaranteed by checks)."; + return (long) value; + } + }; + + // And rely on its shrinking logic. + return longAdapter.inferPreciseTypeShrunk(); + } } From 7154cbba6f42ba2e3b2a9b822672551bd339d540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 13:13:07 +0200 Subject: [PATCH 28/31] allow adding whole-number Floats to a LongBuilder --- .../table/data/column/builder/LongBuilderChecked.java | 6 +++--- .../data/column/builder/LongBuilderUnchecked.java | 8 ++++---- .../table/data/column/storage/type/StorageType.java | 11 +++++++++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderChecked.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderChecked.java index d6cd63677956..0ccd80609db2 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderChecked.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderChecked.java @@ -26,10 +26,10 @@ public void appendNoGrow(Object o) { if (o == null) { isMissing.set(currentSize++); } else { - try { - long x = NumericConverter.coerceToLong(o); + Long x = NumericConverter.tryConvertingToLong(o); + if (x != null) { appendLongNoGrow(x); - } catch (UnsupportedOperationException e) { + } else { throw new ValueTypeMismatchException(type, o); } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderUnchecked.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderUnchecked.java index f36ab5f6c5d2..a835fa692955 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderUnchecked.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilderUnchecked.java @@ -19,10 +19,10 @@ public void appendNoGrow(Object o) { if (o == null) { isMissing.set(currentSize++); } else { - try { - long x = NumericConverter.coerceToLong(o); - data[currentSize++] = x; - } catch (UnsupportedOperationException e) { + Long x = NumericConverter.tryConvertingToLong(o); + if (x != null) { + appendLongNoGrow(x); + } else { throw new ValueTypeMismatchException(getType(), o); } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java index 7ae02626bd36..dfeeaa79ad22 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/StorageType.java @@ -11,9 +11,11 @@ /** * Represents an underlying internal storage type that can be mapped to the Value Type that is exposed to users. */ -public sealed interface StorageType permits AnyObjectType, BigIntegerType, BooleanType, DateTimeType, DateType, FloatType, IntegerType, TextType, TimeOfDayType { +public sealed interface StorageType permits AnyObjectType, BigIntegerType, BooleanType, DateTimeType, DateType, + FloatType, IntegerType, TextType, TimeOfDayType { /** - * @return the StorageType that represents a given boxed item. + * @return the StorageType that represents a given boxed item. This has special handling for floating-point values - + * if they represent a whole number, they will be treated as integers. */ static StorageType forBoxedItem(Object item) { if (NumericConverter.isCoercibleToLong(item)) { @@ -21,6 +23,11 @@ static StorageType forBoxedItem(Object item) { } if (NumericConverter.isFloatLike(item)) { + double value = NumericConverter.coerceToDouble(item); + if (value % 1.0 == 0.0 && IntegerType.INT_64.fits(value)) { + return IntegerType.INT_64; + } + return FloatType.FLOAT_64; } From 4880f4742a93f462290357bfe387203728d89e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 14:58:54 +0200 Subject: [PATCH 29/31] Prevent Char size=0, both in in-memory constructor and at 'type-level' --- .../Base/0.0.0-dev/src/Data/Boolean.enso | 3 -- .../Base/0.0.0-dev/src/Data/Numbers.enso | 37 +++++++++++++++++++ .../0.0.0-dev/src/Data/Type/Enso_Types.enso | 9 +++-- .../0.0.0-dev/src/Data/Type/Storage.enso | 2 +- .../0.0.0-dev/src/Data/Type/Value_Type.enso | 25 +++++++++---- .../operation/map/text/StringStringOp.java | 2 +- .../data/column/storage/type/TextType.java | 12 +++++- .../Conversion_Spec.enso | 10 ++++- .../src/Common_Table_Operations/Map_Spec.enso | 2 +- .../src/Helpers/Value_Type_Spec.enso | 2 +- .../src/In_Memory/Column_Spec.enso | 15 +++++++- 11 files changed, 97 insertions(+), 22 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso index e03cd62ef880..22f88f46efdf 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Boolean.enso @@ -1,6 +1,4 @@ import project.Any.Any -import project.Data.Ordering.Comparable -import project.Data.Ordering.Ordering import project.Nothing.Nothing from project.Data.Boolean.Boolean import False, True @@ -98,4 +96,3 @@ type Boolean if (27 % 3) == 0 then IO.println "Fizz" if_then : Any -> Any | Nothing if_then self ~on_true = @Builtin_Method "Boolean.if_then" - diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Numbers.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Numbers.enso index 2cf6d7a10fda..b3c92a28dcce 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Numbers.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Numbers.enso @@ -1,4 +1,5 @@ import project.Any.Any +import project.Data.Ordering.Comparable import project.Data.Locale.Locale import project.Data.Text.Text import project.Error.Error @@ -1169,3 +1170,39 @@ type Number_Parse_Error to_display_text : Text to_display_text self = "Could not parse " + self.text.to_text + " as a double." + +## A wrapper type that ensures that a function may only take positive integers. +type Positive_Integer + ## PRIVATE + This constructor should not be used by user code as it can be used to + break the invariants. Instead, this type should only be created by `new` + or conversions. + Value (integer : Integer) + + ## PRIVATE + ADVANCED + Constructor to create a `Positive_Integer` from an `Integer` - checking + if it satisfies the condition. User code should prefer the + `Positive_Integer.from` conversion. + new (integer : Integer) = + if integer > 0 then Positive_Integer.Value integer else + Error.throw (Illegal_Argument.Error "Expected a positive integer, but got "+integer.to_display_text) + +## Allows to create a `Positive_Integer` from an `Integer`. + It will throw `Illegal_Argument` if the provided integer is not positive. +Positive_Integer.from (that : Integer) = Positive_Integer.new that + +## PRIVATE +Integer.from (that : Positive_Integer) = that.integer + +## PRIVATE +type Positive_Integer_Comparator + ## PRIVATE + compare x y = + Comparable.from x.integer . compare x.integer y.integer + + ## PRIVATE + hash x = Comparable.from x.integer . hash x.integer + +## PRIVATE +Comparable.from (_:Positive_Integer) = Positive_Integer_Comparator diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso index 2a42c36fb3ab..ea1e57a7d3c1 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Enso_Types.enso @@ -33,9 +33,12 @@ most_specific_value_type value use_smallest=False = # We do a small rewrite here - for integers we always return the Integer type, even if the value is small enough to fit in a Byte. if value_type == Value_Type.Byte then Value_Type.Integer Bits.Bits_16 else value_type True -> Value_Type.Decimal precision=Nothing scale=0 - text : Text -> case use_smallest of - False -> Value_Type.Char size=Nothing variable_length=True - True -> Value_Type.Char size=text.length variable_length=False + text : Text -> + length = text.length + # Not using Char size=0 for empty strings, because that would be an invalid value. + case use_smallest && length > 0 of + True -> Value_Type.Char size=text.length variable_length=False + False -> Value_Type.Char size=Nothing variable_length=True ## TODO [RW] once we add Enso Native Object Type Value Type, we probably want to prefer it over Mixed _ -> Value_Type.Mixed diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Storage.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Storage.enso index a71f907c6c0d..61d761293118 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Storage.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Storage.enso @@ -54,7 +54,7 @@ closest_storage_type value_type = case value_type of Error.throw (Illegal_Argument.Error "Value_Type.Char with fixed length must have a non-nothing size") Value_Type.Char max_length variable_length -> fixed_length = variable_length.not - TextType.new max_length fixed_length + TextType.new (max_length : Integer) fixed_length Value_Type.Date -> DateType.INSTANCE # We currently will not support storing dates without timezones in in-memory mode. Value_Type.Date_Time _ -> DateTimeType.INSTANCE diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso index f7f713a2c3ae..7af1c1298fe7 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Type/Value_Type.enso @@ -1,4 +1,5 @@ from Standard.Base import all +import Standard.Base.Data.Numbers.Positive_Integer import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import project.Data.Type.Value_Type_Helpers @@ -107,10 +108,10 @@ type Value_Type Arguments: - size: the maximum number of characters that can be stored in the - column. It can be nothing to indicate no limit. + column. It can be nothing to indicate no limit. It cannot be 0. - variable_length: whether the size is a maximum or a fixed length. A fixed length string must have a non-nothing size. - Char size:(Integer|Nothing)=Nothing variable_length:Boolean=True + Char (size : (Positive_Integer | Nothing) = Nothing) variable_length:Boolean=True ## Date @@ -393,15 +394,23 @@ type Value_Type Value_Type.Integer size -> "Integer (" + size.to_text + ")" Value_Type.Float size -> "Float (" + size.to_text + ")" Value_Type.Decimal precision scale -> "Decimal (precision=" + precision.to_text + ", scale=" + scale.to_text + ")" - Value_Type.Char size variable_length -> case variable_length of - True -> "Char (variable length, max_size=" + size.to_text + ")" - False -> "Char (fixed length, size=" + size.to_text + ")" + Value_Type.Char size variable_length -> + size_text = case size of + Nothing -> "unlimited" + _ -> size.to Integer . to_text + case variable_length of + True -> "Char (variable length, max_size=" + size_text + ")" + False -> "Char (fixed length, size=" + size_text + ")" Value_Type.Date -> "Date" Value_Type.Date_Time with_timezone -> "Date_Time (with_timezone=" + with_timezone.to_text + ")" Value_Type.Time -> "Time" - Value_Type.Binary size variable_length -> case variable_length of - True -> "Binary (variable length, max_size=" + size.to_text + " bytes)" - False -> "Binary (fixed length, size=" + size.to_text + " bytes)" + Value_Type.Binary size variable_length -> + size_text = case size of + Nothing -> "unlimited" + _ -> size.to Integer . to_text + " bytes" + case variable_length of + True -> "Binary (variable length, max_size=" + size_text + ")" + False -> "Binary (fixed length, size=" + size_text + ")" Value_Type.Unsupported_Data_Type type_name _ -> case type_name of Nothing -> "Unsupported_Data_Type" _ : Text -> "Unsupported_Data_Type (" + type_name + ")" diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringStringOp.java b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringStringOp.java index 3f67de2f278e..b984f2dcd945 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringStringOp.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringStringOp.java @@ -23,7 +23,7 @@ public StringStringOp(String name) { public Storage runBinaryMap(SpecializedStorage storage, Object arg, MapOperationProblemBuilder problemBuilder) { int size = storage.size(); if (arg == null) { - StringBuilder builder = new StringBuilder(size, TextType.variableLengthWithLimit(0)); + StringBuilder builder = new StringBuilder(size, TextType.VARIABLE_LENGTH); builder.appendNulls(size); return builder.seal(); } else if (arg instanceof String argString) { diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/TextType.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/TextType.java index cfc72407998b..311dee27c170 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/TextType.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/type/TextType.java @@ -3,6 +3,12 @@ import org.enso.base.Text_Utils; public record TextType(long maxLength, boolean fixedLength) implements StorageType { + public TextType { + if (maxLength == 0) { + throw new IllegalArgumentException("The maxLength of a text type must be positive or -1 to indicate unlimited length."); + } + } + public static final TextType VARIABLE_LENGTH = new TextType(-1, false); public static TextType fixedLength(long length) { @@ -10,7 +16,7 @@ public static TextType fixedLength(long length) { } public static TextType variableLengthWithLimit(long maxLength) { - assert maxLength >= 0; + assert maxLength > 0; return new TextType(maxLength, false); } @@ -90,6 +96,10 @@ public static TextType concatTypes(TextType type1, TextType type2) { boolean bothFixed = type1.fixedLength && type2.fixedLength; long lengthSum = type1.maxLength + type2.maxLength; + if (lengthSum == 0) { + return VARIABLE_LENGTH; + } + return new TextType(lengthSum, bothFixed); } } diff --git a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso index 369c5085950e..e4dc5984ef49 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Conversion_Spec.enso @@ -67,7 +67,6 @@ spec setup = c.value_type.is_text . should_be_true c.to_vector . should_equal ["{{{MY Type [x=42] }}}", "{{{MY Type [x=X] }}}"] - # TODO what to test here? Test.specify "should allow to cast an integer column to a decimal type" <| t = table_builder [["X", [1, 2, 3]]] c = t.at "X" . cast Value_Type.Decimal @@ -116,6 +115,15 @@ spec setup = w2 = Problems.expect_warning Conversion_Failure c2 w2.affected_rows_count . should_equal 4 + Test.specify "should not allow 0-length Char type" <| + c1 = table_builder [["X", ["a", "", "bcd"]]] . at "X" + r1 = c1.cast (Value_Type.Char size=0 variable_length=False) + r1.should_fail_with Illegal_Argument + r1.catch.to_display_text . should_contain "positive" + + r2 = c1.cast (Value_Type.Char size=0 variable_length=True) + r2.should_fail_with Illegal_Argument + Test.group prefix+"Table/Column.cast - numeric" <| Test.specify "should allow to cast a boolean column to integer" <| t = table_builder [["X", [True, False, True]]] diff --git a/test/Table_Tests/src/Common_Table_Operations/Map_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Map_Spec.enso index 2bc2074d1f4e..9a46772568cd 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Map_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Map_Spec.enso @@ -125,7 +125,7 @@ spec setup = k x = if x == 2 then Time_Of_Day.new 13 05 else (x+1).to_text r7 = c1.map k expected_value_type=Value_Type.Char r7.should_fail_with Invalid_Value_Type - r7.catch.to_display_text . should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 13:05:00 of type Time" + r7.catch.to_display_text . should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 13:05:00 of type Time" l x = if x == 2 then 42 else Date.new 2022 05 x r8 = c1.map l expected_value_type=Value_Type.Date diff --git a/test/Table_Tests/src/Helpers/Value_Type_Spec.enso b/test/Table_Tests/src/Helpers/Value_Type_Spec.enso index d39064bd1e7e..646dc443fc1f 100644 --- a/test/Table_Tests/src/Helpers/Value_Type_Spec.enso +++ b/test/Table_Tests/src/Helpers/Value_Type_Spec.enso @@ -17,7 +17,7 @@ spec = Value_Type.Float.to_display_text . should_equal "Float (64 bits)" Value_Type.Decimal.to_display_text . should_equal "Decimal (precision=Nothing, scale=Nothing)" - Value_Type.Char.to_display_text . should_equal "Char (variable length, max_size=Nothing)" + Value_Type.Char.to_display_text . should_equal "Char (variable length, max_size=unlimited)" (Value_Type.Binary 8 False).to_display_text . should_equal "Binary (fixed length, size=8 bytes)" Value_Type.Date.to_display_text . should_equal "Date" diff --git a/test/Table_Tests/src/In_Memory/Column_Spec.enso b/test/Table_Tests/src/In_Memory/Column_Spec.enso index 3f0979b4e597..ae83602402fb 100644 --- a/test/Table_Tests/src/In_Memory/Column_Spec.enso +++ b/test/Table_Tests/src/In_Memory/Column_Spec.enso @@ -166,10 +166,14 @@ spec = c8.value_type . should_equal Value_Type.Mixed c8.to_vector . should_equal ["aaa", 42, Date.new 2022 08 22] + c9 = Column.from_vector "X" [Time_Of_Day.new 10 11 12, Time_Of_Day.new 11 30] Value_Type.Time + c9.value_type . should_equal Value_Type.Time + c9.to_vector . should_equal [Time_Of_Day.new 10 11 12, Time_Of_Day.new 11 30] + Test.specify "will fail if unexpected values are encountered for the requested type" <| r1 = Column.from_vector "X" ["a", 2] Value_Type.Char r1.should_fail_with Invalid_Value_Type - r1.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 2 of type Integer (16 bits)" + r1.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 2 of type Integer (16 bits)" r2 = Column.from_vector "X" ["aaa", "b"] (Value_Type.Char size=3 variable_length=False) r2.should_fail_with Invalid_Value_Type @@ -177,7 +181,7 @@ spec = r3 = Column.from_vector "X" ["aaa", 42] Value_Type.Char r3.should_fail_with Invalid_Value_Type - r3.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 42 of type Integer (16 bits)" + r3.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 42 of type Integer (16 bits)" r4 = Column.from_vector "X" [12, Time_Of_Day.new 10 11 12] Value_Type.Integer r4.should_fail_with Invalid_Value_Type @@ -199,6 +203,13 @@ spec = r8.should_fail_with Invalid_Value_Type r8.catch.to_display_text.should_contain "Expected type Byte, but got a value 1000000000 of type Integer (32 bits)" + Test.specify "will not allow to construct a column with Char size=0" <| + r1 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=False) + r1.should_fail_with Illegal_Argument + + r2 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=True) + r2.should_fail_with Illegal_Argument + Test.group "Rounding" <| Test.specify "should be able to round a column of decimals" <| Column.from_vector "foo" [1.2, 2.3, 2.5, 3.6] . round . should_equal (Column.from_vector "round([foo])" [1, 2, 3, 4]) From 7e56752dee5d82087965582ab29f402acd49b38a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 15:00:17 +0200 Subject: [PATCH 30/31] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c242d4d9b54..f697afa66d75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -578,6 +578,7 @@ - [Renamed `Decimal` to `Float`.][7807] - [Implemented `Date_Time_Formatter` for more user-friendly date/time format parsing.][7826] +- [Implemented `Table.auto_value_types` for in-memory tables.][7908] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -820,6 +821,7 @@ [7776]: https://github.com/enso-org/enso/pull/7776 [7807]: https://github.com/enso-org/enso/pull/7807 [7826]: https://github.com/enso-org/enso/pull/7826 +[7908]: https://github.com/enso-org/enso/pull/7908 #### Enso Compiler From 445956973b70f2ae613637d64b4b45797830f600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Sep 2023 15:08:02 +0200 Subject: [PATCH 31/31] javafmt --- .../data/column/storage/MixedStorage.java | 3 ++- .../table/data/column/storage/Storage.java | 14 +++++----- .../storage/numeric/AbstractLongStorage.java | 13 ++++------ .../storage/numeric/BigIntegerStorage.java | 26 ++++++++++--------- .../numeric/ComputedNullableLongStorage.java | 8 +++--- .../column/storage/numeric/DoubleStorage.java | 26 ++++++++++--------- 6 files changed, 46 insertions(+), 44 deletions(-) diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java index c5286c82729d..ce0e7ef854bd 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/MixedStorage.java @@ -114,7 +114,8 @@ public StorageType inferPreciseTypeShrunk() { return AnyObjectType.INSTANCE; } - // If we are able to get a more specialized storage for more specific type - we delegate to its own shrinking logic. + // If we are able to get a more specialized storage for more specific type - we delegate to its + // own shrinking logic. return specialized.inferPreciseTypeShrunk(); } diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java index b5862940cbd7..adc67afcb558 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java @@ -40,13 +40,13 @@ public StorageType inferPreciseType() { } /** - * Returns the smallest type (according to Column.auto_value_type rules) that may still fit all values in this - * column. - *

- * It is a sibling of `inferPreciseType` that allows some further shrinking. It is kept separate, because - * `inferPreciseType` should be quick to compute (cached if needed) as it is used in typechecking of lots of - * operations. This one however, is only used in a specific `auto_value_type` use-case and rarely will need to be - * computed more than once. + * Returns the smallest type (according to Column.auto_value_type rules) that may still fit all + * values in this column. + * + *

It is a sibling of `inferPreciseType` that allows some further shrinking. It is kept + * separate, because `inferPreciseType` should be quick to compute (cached if needed) as it is + * used in typechecking of lots of operations. This one however, is only used in a specific + * `auto_value_type` use-case and rarely will need to be computed more than once. */ public StorageType inferPreciseTypeShrunk() { return getType(); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java index cb25574b4836..04a00bf8b00b 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/AbstractLongStorage.java @@ -1,5 +1,6 @@ package org.enso.table.data.column.storage.numeric; +import java.util.BitSet; import org.enso.table.data.column.builder.Builder; import org.enso.table.data.column.builder.NumericBuilder; import org.enso.table.data.column.operation.map.MapOperationProblemBuilder; @@ -25,8 +26,6 @@ import org.enso.table.data.column.storage.type.StorageType; import org.graalvm.polyglot.Context; -import java.util.BitSet; - public abstract class AbstractLongStorage extends NumericStorage { public abstract long getItem(int idx); @@ -87,17 +86,15 @@ public StorageType inferPreciseType() { @Override public StorageType inferPreciseTypeShrunk() { - // If the type is already smallest possible, we return it unchanged (we will return 8-bit columns as-is, although + // If the type is already smallest possible, we return it unchanged (we will return 8-bit + // columns as-is, although // we will not shrink 16-bit columns to 8-bits even if it were possible). if (getType().bits().toInteger() <= 16) { return getType(); } - IntegerType[] possibleTypes = new IntegerType[] { - IntegerType.INT_16, - IntegerType.INT_32, - IntegerType.INT_64 - }; + IntegerType[] possibleTypes = + new IntegerType[] {IntegerType.INT_16, IntegerType.INT_32, IntegerType.INT_64}; int currentTypeIdx = 0; int n = size(); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java index 76626b54126a..be5cb195df3a 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/BigIntegerStorage.java @@ -120,7 +120,8 @@ public StorageType inferPreciseType() { } } - inferredType = (allFitInLong && visitedCount > 0) ? IntegerType.INT_64 : BigIntegerType.INSTANCE; + inferredType = + (allFitInLong && visitedCount > 0) ? IntegerType.INT_64 : BigIntegerType.INSTANCE; } return inferredType; @@ -143,17 +144,18 @@ private StorageType findSmallestIntegerTypeThatFits() { final BigIntegerStorage parent = this; // We create a Long storage that gets values by converting our storage. - ComputedNullableLongStorage longAdapter = new ComputedNullableLongStorage(size) { - @Override - protected Long computeItem(int idx) { - BigInteger bigInteger = parent.getItem(idx); - if (bigInteger == null) { - return null; - } - - return bigInteger.longValueExact(); - } - }; + ComputedNullableLongStorage longAdapter = + new ComputedNullableLongStorage(size) { + @Override + protected Long computeItem(int idx) { + BigInteger bigInteger = parent.getItem(idx); + if (bigInteger == null) { + return null; + } + + return bigInteger.longValueExact(); + } + }; // And rely on its shrinking logic. return longAdapter.inferPreciseTypeShrunk(); diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java index 731bf2eaf669..835c41972553 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/ComputedNullableLongStorage.java @@ -1,5 +1,7 @@ package org.enso.table.data.column.storage.numeric; +import java.util.BitSet; +import java.util.List; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.index.Index; @@ -7,13 +9,11 @@ import org.enso.table.data.mask.SliceRange; import org.graalvm.polyglot.Context; -import java.util.BitSet; -import java.util.List; - /** * Implements a storage that computes the ith stored value using some function. * - *

This storage allows for missing values. Prefer {@link ComputedLongStorage} for non-nullable case. + *

This storage allows for missing values. Prefer {@link ComputedLongStorage} for non-nullable + * case. */ public abstract class ComputedNullableLongStorage extends AbstractLongStorage { protected final int size; diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java index 5d3aba45996e..8159aac377dc 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/numeric/DoubleStorage.java @@ -435,18 +435,20 @@ private StorageType findSmallestIntegerTypeThatFits() { final DoubleStorage parent = this; // We create a Long storage that gets values by converting our storage. - ComputedNullableLongStorage longAdapter = new ComputedNullableLongStorage(size) { - @Override - protected Long computeItem(int idx) { - if (parent.isNa(idx)) { - return null; - } - - double value = parent.getItem(idx); - assert value % 1.0 == 0.0 : "The value " + value + " should be a whole number (guaranteed by checks)."; - return (long) value; - } - }; + ComputedNullableLongStorage longAdapter = + new ComputedNullableLongStorage(size) { + @Override + protected Long computeItem(int idx) { + if (parent.isNa(idx)) { + return null; + } + + double value = parent.getItem(idx); + assert value % 1.0 == 0.0 + : "The value " + value + " should be a whole number (guaranteed by checks)."; + return (long) value; + } + }; // And rely on its shrinking logic. return longAdapter.inferPreciseTypeShrunk();