diff --git a/CHANGELOG.md b/CHANGELOG.md index 49572e322ac4..6a214d824754 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -117,6 +117,7 @@ - [Implemented a basic reader for the `Delimited` file format.][3424] - [Implemented a reader for the `Excel` file format.][3425] - [Added custom encoding support to the `Delimited` file format reader.][3430] +- [Implemented `compute` method on `Vector` for statistics calculations.][3442] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -181,6 +182,7 @@ [3424]: https://github.com/enso-org/enso/pull/3424 [3425]: https://github.com/enso-org/enso/pull/3425 [3430]: https://github.com/enso-org/enso/pull/3430 +[3442]: https://github.com/enso-org/enso/pull/3442 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Any.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Any.enso index b41e85165333..cb50239447bf 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Any.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Any.enso @@ -167,7 +167,9 @@ type Any a = 6 * 21 a >= 147 >= : Any -> Boolean - >= that = (this > that) || (this == that) + >= that = + ordering = this.compare_to that + (ordering == Ordering.Greater) || (ordering == Ordering.Equal) ## ALIAS Less Than @@ -218,7 +220,9 @@ type Any a = 7 * 21 a < 147 <= : Any -> Boolean - <= that = (this < that) || (this == that) + <= that = + ordering = this.compare_to that + (ordering == Ordering.Less) || (ordering == Ordering.Equal) ## Checks if the type is an instance of `Nothing`. @@ -230,9 +234,17 @@ type Any 1.is_nothing is_nothing : Boolean - is_nothing = case this of - Nothing -> True - _ -> False + is_nothing = False + + ## UNSTABLE + If this is Nothing then returns `function`. + + > Example + If the value "Hello" is nothing return "". 
+ + "Hello".if_nothing "" + if_nothing : Any -> Any + if_nothing ~_ = this ## Executes the provided handler on an error, or returns a non-error value unchanged. diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso index 9038ef90ad47..3af68115cab6 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso @@ -249,8 +249,8 @@ us = here.from_language_tag "en-US" example_new = Locale.new "en" "GB" "UTF-8" new : Text -> Text | Nothing -> Text | Nothing -> Locale new language country=Nothing variant=Nothing = - country_text = if country.is_nothing then "" else country - variant_text = if variant.is_nothing then "" else variant + country_text = country.if_nothing "" + variant_text = variant.if_nothing "" java_locale = JavaLocale.new language country_text variant_text here.from_java java_locale diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Comparator.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Comparator.enso index 87d3ee12cbdd..492c5dbb6741 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Comparator.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Ordering/Comparator.enso @@ -1,12 +1,17 @@ -from Standard.Base import all +from Standard.Base import Any, Ordering, Nothing, Vector polyglot java import org.enso.base.ObjectComparator ## ADVANCED - Creates a Java Comparator object which can call En + Creates a Java Comparator object which can call Enso compare_to + + Arguments: + - custom_comparator: + If `Nothing` will get a singleton instance for `.compare_to`. + Otherwise can support a custom fallback comparator. new : Nothing | (Any->Any->Ordering) new custom_comparator=Nothing = - comparator_to_java cmp x y = cmp x y . to_sign + comparator_to_java cmp x y = Vector.handle_incomparable_value (cmp x y . 
to_sign) case custom_comparator of Nothing -> ObjectComparator.getInstance (comparator_to_java .compare_to) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso new file mode 100644 index 000000000000..fa06bb821a6c --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -0,0 +1,117 @@ +from Standard.Base import Boolean, True, False, Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types +from Standard.Base.Data.Vector import Empty_Error + +import Standard.Base.Data.Ordering.Comparator + +polyglot java import org.enso.base.statistics.Moments +polyglot java import org.enso.base.statistics.CountMinMax + +type Statistic + ## PRIVATE + Convert the Enso Statistic into Java equivalent. + to_java : SingleValue + to_java = case this of + Sum -> Moments.SUM + Mean -> Moments.MEAN + Variance p -> if p then Moments.VARIANCE_POPULATION else Moments.VARIANCE + Standard_Deviation p -> if p then Moments.STANDARD_DEVIATION_POPULATION else Moments.STANDARD_DEVIATION + Skew p -> if p then Moments.SKEW_POPULATION else Moments.SKEW + Kurtosis -> Moments.KURTOSIS + _ -> Nothing + + ## Count the number of non-Nothing and non-NaN values. + type Count + + ## The minimum value. + type Minimum + + ## The maximum value. + type Maximum + + ## Sum the non-Nothing and non-NaN values. + type Sum + + ## The sample mean of the values. + type Mean + + ## The variance of the values. + Arguments: + - population: specifies if data is a sample or the population. + type Variance (population:Boolean=False) + + ## The standard deviation of the values. + Arguments: + - population: specifies if data is a sample or the population. + type Standard_Deviation (population:Boolean=False) + + ## The skewness of the values. + Arguments: + - population: specifies if data is a sample or the population. 
+ type Skew (population:Boolean=False) + + ## The sample kurtosis of the values. + type Kurtosis + +## Compute a single statistic on a vector like object. + + Arguments: + - data: Vector like object which has a `to_array` method. + - statistic: Statistic to calculate. +compute : Vector -> Statistic -> Any +compute data statistic=Count = + here.compute_bulk data [statistic] . first + + +## Compute a set of statistics on a vector like object. + + Arguments: + - data: Vector like object which has a `to_array` method. + - statistics: Set of statistics to calculate. +compute_bulk : Vector -> [Statistic] -> [Any] +compute_bulk data statistics=[Count, Sum] = + + count_min_max = statistics.any s->((s.is_a Count) || (s.is_a Minimum) || (s.is_a Maximum)) + + java_stats = statistics.map .to_java + skip_java_stats = java_stats.all s->s.is_nothing + report_invalid _ = + statistics.map_with_index i->v-> + if java_stats.at i . is_nothing then Nothing else + Error.throw (Illegal_Argument_Error ("Can only compute " + v.to_text + " on numerical data sets.")) + handle_unsupported = Panic.catch Unsupported_Argument_Types handler=report_invalid + + empty_map s = if (s == Count) || (s == Sum) then 0 else + if (s == Minimum) || (s == Maximum) then Error.throw Empty_Error else + Number.nan + + if data.length == 0 then statistics.map empty_map else + count_min_max_values = if count_min_max then CountMinMax.new (CountMinMax.toObjectStream data.to_array) Comparator.new else Nothing + stats_array = if skip_java_stats then Nothing else + handle_unsupported <| Moments.compute data.to_array java_stats.to_array + + statistics.map_with_index i->s->case s of + Count -> count_min_max_values.count + Minimum -> + if count_min_max_values.comparatorError then (Error.throw Vector.Incomparable_Values_Error) else + count_min_max_values.minimum + Maximum -> + if count_min_max_values.comparatorError then (Error.throw Vector.Incomparable_Values_Error) else + count_min_max_values.maximum + _ -> stats_array.at 
i + +## Compute a single statistic on the vector. + + Arguments: + - statistic: Statistic to calculate. +Vector.Vector.compute : Statistic -> Any +Vector.Vector.compute statistic=Count = + this.compute_bulk [statistic] . first + + +## Compute statistics on the vector. + + Arguments: + - statistics: Set of statistics to calculate. +Vector.Vector.compute_bulk : [Statistic] -> [Any] +Vector.Vector.compute_bulk statistics=[Count, Sum] = + here.compute_bulk this statistics diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso index 467ec8edde3c..09e5fca7a27f 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso @@ -960,7 +960,7 @@ type Vector More details on the HashCode / HashMap ticket https://www.pivotaltracker.com/story/show/181027272. - recovered = Panic.recover Any + here.handle_incomparable_value <| builder = here.new_builder this.fold Map.empty existing-> item-> @@ -970,12 +970,6 @@ type Vector existing.insert key True builder.to_vector - recovered.map_error e-> case e of - No_Such_Method_Error _ _ -> Incomparable_Values_Error - Unsupported_Argument_Types _ -> Incomparable_Values_Error - Type_Error _ _ _ -> Incomparable_Values_Error - _ -> Panic.throw e - ## UNSTABLE @@ -1169,3 +1163,10 @@ type Partition_Accumulator true_builder false_builder ix An error indicating that the vector contains incomparable types. type Incomparable_Values_Error + +## ADVANCED + Catches possible errors from comparing values and throws an + Incomparable_Values_Error if any occur. 
+handle_incomparable_value ~function = + handle t = Panic.catch t handler=(Error.throw Incomparable_Values_Error) + handle No_Such_Method_Error <| handle Type_Error <| handle Unsupported_Argument_Types <| function diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Nothing.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Nothing.enso index 9b24da3a2528..f88e2a00e4ed 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Nothing.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Nothing.enso @@ -1,7 +1,30 @@ -## The type that has only a singleton value. +from Standard.Base import Boolean, True - It is often used alongside a value of type a to provide a Maybe or - Option abstraction. The type a | Nothing is semantically equivalent to - Maybe a. -@Builtin_Type type Nothing + ## The type that has only a singleton value. Nothing in Enso is used as a + universal value to indicate the lack of presence of a value. + + It is often used alongside a value of type a to provide a Maybe or + Option abstraction. The type a | Nothing is semantically equivalent to + Maybe a. + @Builtin_Type + type Nothing + + ## Checks if the type is an instance of `Nothing`. + + > Example + Checking if the value 1 is nothing. + + 1.is_nothing + is_nothing : Boolean + is_nothing = True + + ## UNSTABLE + If this is Nothing then returns `function`. + + > Example + If the value "Hello" is nothing return "". + + "Hello".if_nothing "" + if_nothing : Any -> Any + if_nothing ~function = function diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso index f33be8901078..3cc7abc6f57e 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso @@ -114,7 +114,7 @@ type Column defaults to the current type if not provided. 
make_binary_op : Text -> Text -> (Column | Any) -> (Sql_Type | Nothing) -> (Sql_Type | Nothing) -> Column make_binary_op op_kind operand new_type=Nothing operand_type=Nothing = - actual_new_type = if new_type.is_nothing then this.sql_type else new_type + actual_new_type = new_type.if_nothing this.sql_type case operand of Column _ _ _ other_expr _ -> case Helpers.check_integrity this operand of @@ -124,7 +124,7 @@ type Column new_expr = IR.Operation op_kind [this.expression, other_expr] Column this.name this.connection actual_new_type new_expr this.context _ -> - actual_operand_type = if operand_type.is_nothing then this.sql_type else operand_type + actual_operand_type = operand_type.if_nothing this.sql_type other = IR.make_constant actual_operand_type operand new_expr = IR.Operation op_kind [this.expression, other] Column this.name this.connection actual_new_type new_expr this.context @@ -139,7 +139,7 @@ type Column operator. make_unary_op : Text -> Text -> (Sql_Type | Nothing) -> Column make_unary_op op_kind new_type=Nothing = - actual_new_type = if new_type.is_nothing then this.sql_type else new_type + actual_new_type = new_type.if_nothing this.sql_type new_expr = IR.Operation op_kind [this.expression] Column this.name this.connection actual_new_type new_expr this.context @@ -605,7 +605,7 @@ type Aggregate_Column_Builder - new_type: The SQL type of the result column. 
make_aggregate : Column -> Text -> Text -> Sql_Type -> Column make_aggregate column operation name_suffix="_agg" new_type=Nothing = - actual_new_type = if new_type.is_nothing then column.sql_type else new_type + actual_new_type = new_type.if_nothing column.sql_type expr = IR.Operation operation [column.expression] case Helpers.ensure_name_is_sane name_suffix of True -> diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso index 89ae00d443cc..f177549c1686 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso @@ -507,8 +507,8 @@ type Table Text -> wrap_elem elem Column _ _ _ _ _ -> wrap_elem elem Order_Rule elem Nothing my_order my_nulls -> - chosen_order = if my_order.is_nothing then order else my_order - chosen_nulls = if my_nulls.is_nothing then missing_last else my_nulls + chosen_order = my_order.if_nothing order + chosen_nulls = my_nulls.if_nothing missing_last [this.resolve elem . expression, order_to_ir chosen_order, missing_to_ir chosen_nulls] Order_Rule _ _ _ _ -> Error.throw <| Unsupported_Database_Operation_Error "Custom comparators are not supported in Database" diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Group_By_Key.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Group_By_Key.enso deleted file mode 100644 index 6068370b55e5..000000000000 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Group_By_Key.enso +++ /dev/null @@ -1,43 +0,0 @@ -from Standard.Base import all -import Standard.Base.Data.Ordering.Vector_Lexicographic_Order - -## Create a key structure for grouping operations -key : Vector -> Group_By_Key -key values = - mapper c = Comparable_Value c - Group_By_Key <| values.map mapper - -## PRIVATE - Represents a comparable vector of element which is used as key for grouping. 
-type Group_By_Key - type Group_By_Key values - - ## See if two keys are equal - == : Group_By_Key->Boolean - == that = this.values == that.values - - ## Compares two keys - compare_to : Group_By_Key->Ordering - compare_to that = - Vector_Lexicographic_Order.compare this.values that.values - -## PRIVATE - Temporary workaround allowing Nothing to be in a Group_By -type Comparable_Value - type Comparable_Value value - - == : Comparable_Nothing->Boolean - == that = (this.compare_to that) == Ordering.Equal - - compare_to : Any->Ordering - compare_to that = - value = case that of - Comparable_Value v -> v - _ -> that - - case this.value of - Nothing -> if value.is_nothing then Ordering.Equal else Ordering.Less - _ -> if value.is_nothing then Ordering.Greater else this.value.compare_to value - - is_nothing : Boolean - is_nothing = this.value.is_nothing diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 3f2884f2055f..62c12895a6dd 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -17,7 +17,6 @@ from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Prob import Standard.Table.Data.Column_Mapping import Standard.Table.Data.Position -import Standard.Table.Data.Group_By_Key import Standard.Table.Data.Aggregate_Column polyglot java import org.enso.table.data.table.Table as Java_Table diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Aggregate_Column_Helper.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Aggregate_Column_Helper.enso index 2f8f7fd58347..72aff863f79b 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Aggregate_Column_Helper.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Aggregate_Column_Helper.enso @@ -67,7 +67,7 @@ prepare_aggregate_columns aggregates table = pass_1 = 
valid_resolved_aggregate_columns.map c->(if c.new_name.is_nothing then Nothing else unique.make_unique c.new_name) renamed_columns = pass_1.map_with_index i->name-> agg = valid_resolved_aggregate_columns.at i - new_name = (if name.is_nothing then unique.make_unique (here.default_aggregate_column_name agg) else name) + new_name = name.if_nothing (unique.make_unique (here.default_aggregate_column_name agg)) Pair new_name agg # Build Problems Output @@ -116,7 +116,7 @@ resolve_aggregate table problem_builder aggregate_column = resolve : (Integer|Text|Column) -> Column ! Internal_Missing_Column_Error resolve c = res = Table_Helpers.resolve_column_helper table_columns c problem_builder - if res.is_nothing then Error.throw Internal_Missing_Column_Error else res + res.if_nothing (Error.throw Internal_Missing_Column_Error) resolve_selector_to_vector : Column_Selector -> [Column] ! Internal_Missing_Column_Error resolve_selector_to_vector selector = @@ -189,13 +189,13 @@ java_aggregator name column = First c _ ignore_nothing ordering -> if ordering.is_nothing then FirstAggregator.new name c.java_column ignore_nothing else ordering_array = ordering.map .java_column - FirstAggregator.new name c.java_column ignore_nothing ordering_array.to_array (Comparator.new) + FirstAggregator.new name c.java_column ignore_nothing ordering_array.to_array Comparator.new Last c _ ignore_nothing ordering -> if ordering.is_nothing then LastAggregator.new name c.java_column ignore_nothing else ordering_array = ordering.map .java_column - LastAggregator.new name c.java_column ignore_nothing ordering_array.to_array (Comparator.new) - Maximum c _ -> MinOrMaxAggregator.new name c.java_column 1 - Minimum c _ -> MinOrMaxAggregator.new name c.java_column -1 + LastAggregator.new name c.java_column ignore_nothing ordering_array.to_array Comparator.new + Maximum c _ -> MinOrMaxAggregator.new name c.java_column 1 Comparator.new + Minimum c _ -> MinOrMaxAggregator.new name c.java_column -1 Comparator.new 
Shortest c _ -> ShortestOrLongestAggregator.new name c.java_column -1 Longest c _ -> ShortestOrLongestAggregator.new name c.java_column 1 Concatenate c _ join prefix suffix quote -> ConcatenateAggregator.new name c.java_column join prefix suffix quote diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso index 07d488d30cad..872d11561954 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso @@ -164,7 +164,7 @@ rename_columns internal_columns mapping on_problems = new_names = 0.up_to col_count . map i-> target = index_map.get_or_else i Nothing - if target.is_nothing then target else unique.make_unique target + if target.is_nothing then Nothing else unique.make_unique target new_names Column_Mapping.By_Position vec -> @@ -179,7 +179,7 @@ rename_columns internal_columns mapping on_problems = new_names processed = mapped.map_with_index i->n-> - if n.is_nothing then (unique.make_unique (internal_columns.at i).name) else n + n.if_nothing (unique.make_unique (internal_columns.at i).name) if unique.invalid_names.not_empty then problem_builder.report_other_warning (Invalid_Output_Column_Names unique.invalid_names.to_vector) diff --git a/std-bits/base/src/main/java/org/enso/base/ObjectComparator.java b/std-bits/base/src/main/java/org/enso/base/ObjectComparator.java index 9500d0cd25d0..3767a7f6349a 100644 --- a/std-bits/base/src/main/java/org/enso/base/ObjectComparator.java +++ b/std-bits/base/src/main/java/org/enso/base/ObjectComparator.java @@ -15,9 +15,9 @@ public class ObjectComparator implements Comparator { * @return Comparator object */ public static ObjectComparator getInstance( - BiFunction fallbackComparator) { + BiFunction fallbackComparator) { if (INSTANCE == null) { - INSTANCE = new ObjectComparator(fallbackComparator); + INSTANCE = new 
ObjectComparator((l, r) -> fallbackComparator.apply(l, r).intValue()); } return INSTANCE; diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/CountMinMax.java b/std-bits/base/src/main/java/org/enso/base/statistics/CountMinMax.java new file mode 100644 index 000000000000..24111b8de7c0 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/CountMinMax.java @@ -0,0 +1,50 @@ +package org.enso.base.statistics; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.Iterator; +import java.util.function.BiFunction; +import java.util.stream.Stream; + +public class CountMinMax { + private static boolean isValid(Object v) { + return !(v == null || (v instanceof Double && Double.isNaN((Double)v))); + } + + public static Stream toObjectStream(Object[] array) { + return Arrays.stream(array); + } + + public final int count; + public final boolean comparatorError; + public final Object minimum; + public final Object maximum; + + public CountMinMax(Stream values, Comparator objectComparator) { + int count = 0; + + boolean comparatorFailed = false; + Object minimum = null; + Object maximum = null; + + Iterator iterator = values.filter(CountMinMax::isValid).iterator(); + while (iterator.hasNext()) { + Object value = iterator.next(); + count++; + + if (!comparatorFailed) { + try { + minimum = minimum == null || objectComparator.compare(minimum, value) > 0 ? value : minimum; + maximum = maximum == null || objectComparator.compare(maximum, value) < 0 ? value : maximum; + } catch (ClassCastException e) { + comparatorFailed = true; + } + } + } + + this.count = count; + this.comparatorError = comparatorFailed; + this.minimum = comparatorFailed ? null : minimum; + this.maximum = comparatorFailed ? 
null : maximum; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Kurtosis.java b/std-bits/base/src/main/java/org/enso/base/statistics/Kurtosis.java new file mode 100644 index 000000000000..dadcb868ea24 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Kurtosis.java @@ -0,0 +1,28 @@ +package org.enso.base.statistics; + +public class Kurtosis implements MomentStatistic { + private final Variance variance; + + public Kurtosis() { + this.variance = new Variance(false); + } + + @Override + public int order() { + return 4; + } + + @Override + public double evaluate(long n, double[] sums) { + if (n < 4) { + return Double.NaN; + } + + double avg = sums[0] / n; + double var = variance.evaluate(n, sums); + double scale = n * (n + 1) / ((n - 1) * (n - 2) * (n - 3) * var * var); + double shift = 3.0 * (n - 1.0) * (n - 1.0) / ((n - 2.0) * (n - 3.0)); + return (sums[3] - 4 * avg * sums[2] + 6 * avg * avg * sums[1] - 3 * avg * avg * avg * sums[0]) + * scale - shift; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Mean.java b/std-bits/base/src/main/java/org/enso/base/statistics/Mean.java new file mode 100644 index 000000000000..e402be366789 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Mean.java @@ -0,0 +1,13 @@ +package org.enso.base.statistics; + +public class Mean implements MomentStatistic { + @Override + public int order() { + return 1; + } + + @Override + public double evaluate(long n, double[] sums) { + return n == 0 ? 
Double.NaN : sums[0] / n; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/MomentStatistic.java b/std-bits/base/src/main/java/org/enso/base/statistics/MomentStatistic.java new file mode 100644 index 000000000000..647aa03ea813 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/MomentStatistic.java @@ -0,0 +1,17 @@ +package org.enso.base.statistics; + +public interface MomentStatistic { + /*** + * Maximum order needed to compute the statistic + * @return Max order needed. 0 if only need the count. + */ + int order(); + + /*** + * Compute the statistic + * @param n the count of valid values + * @param sums the totals of each order + * @return computed statistic + */ + double evaluate(long n, double[] sums); +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Moments.java b/std-bits/base/src/main/java/org/enso/base/statistics/Moments.java new file mode 100644 index 000000000000..3663448dbc40 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Moments.java @@ -0,0 +1,71 @@ +package org.enso.base.statistics; + +import java.util.Arrays; + +/*** + * Set of descriptive statistics for numerical data sets + */ +public class Moments { + + /** Statistic to compute the total of the values. */ + public static final MomentStatistic SUM = new Sum(); + + /** Statistic to compute the mean average of the values. */ + public static final MomentStatistic MEAN = new Mean(); + + /** Statistic to compute the sample variance of the values. */ + public static final MomentStatistic VARIANCE = new Variance(false); + + /** Statistic to compute the population variance of the values. */ + public static final MomentStatistic VARIANCE_POPULATION = new Variance(true); + + /** Statistic to compute the sample standard deviation of the values. */ + public static final MomentStatistic STANDARD_DEVIATION = new StandardDeviation(false); + + /** Statistic to compute the population standard deviation of the values. 
*/ + public static final MomentStatistic STANDARD_DEVIATION_POPULATION = new StandardDeviation(true); + + /** Statistic to compute the sample skewness of the values. */ + public static final MomentStatistic SKEW = new Skew(false); + + /** Statistic to compute the population skewness of the values. */ + public static final MomentStatistic SKEW_POPULATION = new Skew(true); + + /** Statistic to compute the sample kurtosis of the values. */ + public static final MomentStatistic KURTOSIS = new Kurtosis(); + + /** + * Compute a set of statistics on a data set. + * + * @param data set of values. + * @param statistics set of statistics to compute. + * @return computed statistics. + */ + public static double[] compute(Double[] data, MomentStatistic[] statistics) { + if (statistics.length == 0) { + return new double[0]; + } + + int order = Arrays.stream(statistics).mapToInt(s -> s == null ? 0 : s.order()).max().getAsInt(); + + long count = 0; + double[] totals = new double[order]; + for (Double value : data) { + if (value == null || Double.isNaN(value)) { + continue; + } + + count++; + double v = value; + for (int i = 0; i < order; i++) { + totals[i] += v; + v *= value; + } + } + + final long _count = count; + return Arrays.stream(statistics) + .mapToDouble(s -> s == null ? 
Double.NaN : s.evaluate(_count, totals)) + .toArray(); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Skew.java b/std-bits/base/src/main/java/org/enso/base/statistics/Skew.java new file mode 100644 index 000000000000..d254cccbec0f --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Skew.java @@ -0,0 +1,29 @@ +package org.enso.base.statistics; + +public class Skew implements MomentStatistic { + private final Boolean population; + private final StandardDeviation standardDeviation; + + public Skew(boolean population) { + this.population = population; + this.standardDeviation = new StandardDeviation(population); + } + + @Override + public int order() { + return 3; + } + + @Override + public double evaluate(long n, double[] sums) { + if (n < 3) { + return Double.NaN; + } + + double avg = sums[0] / n; + double st_dev = standardDeviation.evaluate(n, sums); + double scale = + 1.0 / (st_dev * st_dev * st_dev) / (population ? n : ((n - 1.0) * (n - 2.0) / n)); + return (sums[2] - 3 * avg * sums[1] + 2 * avg * avg * sums[0]) * scale; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/StandardDeviation.java b/std-bits/base/src/main/java/org/enso/base/statistics/StandardDeviation.java new file mode 100644 index 000000000000..e4c9fd307b2f --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/StandardDeviation.java @@ -0,0 +1,19 @@ +package org.enso.base.statistics; + +public class StandardDeviation implements MomentStatistic { + private final Variance variance; + + public StandardDeviation(boolean population) { + this.variance = new Variance(population); + } + + @Override + public int order() { + return this.variance.order(); + } + + @Override + public double evaluate(long n, double[] sums) { + return Math.sqrt(this.variance.evaluate(n, sums)); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Sum.java b/std-bits/base/src/main/java/org/enso/base/statistics/Sum.java new 
file mode 100644 index 000000000000..181a8644451b --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Sum.java @@ -0,0 +1,13 @@ +package org.enso.base.statistics; + +public class Sum implements MomentStatistic { + @Override + public int order() { + return 1; + } + + @Override + public double evaluate(long n, double[] sums) { + return sums[0]; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Variance.java b/std-bits/base/src/main/java/org/enso/base/statistics/Variance.java new file mode 100644 index 000000000000..bfb6f18b3181 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Variance.java @@ -0,0 +1,27 @@ +package org.enso.base.statistics; + +public class Variance implements MomentStatistic { + private final boolean population; + + public Variance(boolean population) { + this.population = population; + } + + public boolean isPopulation() { + return population; + } + + @Override + public int order() { + return 2; + } + + @Override + public double evaluate(long n, double[] sums) { + if (population) { + return n < 1 ? Double.NaN : (sums[1] - sums[0] * sums[0] / n) / n; + } else { + return n < 2 ? 
Double.NaN : (sums[1] - sums[0] * sums[0] / n) / (n - 1); + } + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/aggregations/MinOrMax.java b/std-bits/table/src/main/java/org/enso/table/aggregations/MinOrMax.java index 6d1ccb633a67..6a095bb1be84 100644 --- a/std-bits/table/src/main/java/org/enso/table/aggregations/MinOrMax.java +++ b/std-bits/table/src/main/java/org/enso/table/aggregations/MinOrMax.java @@ -4,6 +4,7 @@ import org.enso.table.data.table.Column; import org.enso.table.data.table.problems.InvalidAggregation; +import java.util.Comparator; import java.util.List; /*** @@ -12,27 +13,30 @@ public class MinOrMax extends Aggregator { private final Storage storage; private final int minOrMax; + private final Comparator objectComparator; /** * Constructs a MinOrMax Aggregator + * * @param name output column name * @param column input column * @param minOrMax <0 for minimum, >0 for maximum */ - public MinOrMax(String name, Column column, int minOrMax) { + public MinOrMax(String name, Column column, int minOrMax, Comparator objectComparator) { super(name, Storage.Type.OBJECT); this.storage = column.getStorage(); this.minOrMax = Integer.signum(minOrMax); + this.objectComparator = objectComparator; } @Override public Object aggregate(List indexes) { Object current = null; - for (int row: indexes) { + for (int row : indexes) { Object value = storage.getItemBoxed(row); if (value != null) { try { - if (current == null || Integer.signum(Compare(current, value)) == minOrMax) { + if (current == null || Integer.signum(objectComparator.compare(value, current)) == minOrMax) { current = value; } } catch (ClassCastException e) { @@ -43,31 +47,4 @@ public Object aggregate(List indexes) { } return current; } - - private static int Compare(Object current, Object value) { - if (current instanceof String && value instanceof String) { - return ((String)value).compareTo((String)current); - } - - if (current instanceof Long) { - Long lValue = CastToLong(value); - if (null 
!= lValue) { - return Long.compare(lValue, (Long)current); - } - - Double dValue = CastToDouble(value); - if (null != dValue) { - return Double.compare(dValue, (Long)current); - } - } - - if (current instanceof Double) { - Double dValue = CastToDouble(value); - if (null != dValue) { - return Double.compare(dValue, (Double)current); - } - } - - throw new ClassCastException(); - } } diff --git a/test/Benchmarks/src/Statistics/CountMinMax.enso b/test/Benchmarks/src/Statistics/CountMinMax.enso new file mode 100644 index 000000000000..38e272671c91 --- /dev/null +++ b/test/Benchmarks/src/Statistics/CountMinMax.enso @@ -0,0 +1,26 @@ +from Standard.Base import IO, Integer, Vector + +import Standard.Test.Bench +import Standard.Test.Faker + +import Standard.Base.Data.Statistics +from Standard.Base.Data.Statistics import all + +## Bench Utilities ============================================================ + +vector_size = 10000000 +iter_size = 5 +num_iterations = 5 + +create_vector : Integer->Integer->Vector +create_vector rows (seed=1646322139) = + faker = Faker.new seed + 0.up_to rows . map _-> faker.make_some_nothing (faker.integer 0 1000000) + +# The Benchmarks ============================================================== +main = + IO.println <| "Making data..." + vector = here.create_vector here.vector_size + + IO.println <| "Testing..." + Bench.measure (vector.compute_bulk [Count, Minimum, Maximum]) "Count Min Max" here.iter_size here.num_iterations diff --git a/test/Table_Tests/src/Json_Spec.enso b/test/Table_Tests/src/Json_Spec.enso index c947a40398f1..d40190bb8652 100644 --- a/test/Table_Tests/src/Json_Spec.enso +++ b/test/Table_Tests/src/Json_Spec.enso @@ -8,18 +8,18 @@ spec = Test.group 'JSON conversion' <| clothes = Enso_Project.data/'clothes.csv' . read_csv simple_empty = Enso_Project.data/'simple_empty.csv' . 
read_csv - Test.specify 'Should convert tables to a format compatible with Json.Array.to_table' <| + Test.specify 'should convert tables to a format compatible with Json.Array.to_table' <| clothes_json = clothes.to_json clothes_json.to_table ['Id', 'Name', 'Quantity', 'Rating', 'Price'] . should_equal clothes - Test.specify 'Should write JSON tables to disk' <| + Test.specify 'should write JSON tables to disk' <| out = Enso_Project.data / 'out.json' out.delete_if_exists simple_empty.write_json out (Json.parse out.read_text).to_table ['a', 'b', 'c'] . should_equal simple_empty out.delete_if_exists - Test.specify 'Should write JSON tables to disk using the write method' <| + Test.specify 'should write JSON tables to disk using the write method' <| out = Enso_Project.data / 'out.json' out.delete_if_exists simple_empty.write out Table.Format.Json diff --git a/test/Tests/src/Data/Statistics_Spec.enso b/test/Tests/src/Data/Statistics_Spec.enso new file mode 100644 index 000000000000..2332934d001d --- /dev/null +++ b/test/Tests/src/Data/Statistics_Spec.enso @@ -0,0 +1,150 @@ +from Standard.Base import Nothing, Vector, Number, True, Illegal_Argument_Error, False + +import Standard.Base.Data.Statistics +from Standard.Base.Data.Statistics import all + +import Standard.Test + +# === Test Resources === + +type Ord number + +Ord.compare_to : Ord -> Ordering +Ord.compare_to that = that.number.compare_to this.number + +type No_Ord number + +# Tests + +spec = + simple_set = [1, 2, 3, 4, 5] + number_set = [0.4, -18.56, -16.99, -16.43, -45.84, 13.44, -6.85, 9.68, -8.55, 10.87, 10.38, 33.85, -41.02, 1.87, -26.52, -13.87, -39.06, 25.92, -16.01, 42.01] + missing_set = number_set.map_with_index i->v->(if i % 5 == 4 then Nothing else v) + with_nans_set = number_set.map_with_index i->v->(if i % 5 == 4 then (if i % 10 == 9 then Number.nan else Nothing) else v) + text_set = ["A", "B", Nothing, "D"] + + ord_set = [Ord 10, Ord 2, Nothing, Ord 9] + no_ord_set = [No_Ord 10, No_Ord 2, Nothing, 
No_Ord 9] + + double_error = 0.000001 + + Test.group "Statistics" <| + Test.specify "should be able to count valid values" <| + simple_set.compute . should_equal 5 + number_set.compute . should_equal 20 + missing_set.compute . should_equal 16 + with_nans_set.compute . should_equal 16 + text_set.compute . should_equal 3 + + Test.specify "should be able to get minimum or maximum values" <| + simple_set.compute Minimum . should_equal 1 + number_set.compute Minimum . should_equal -45.84 epsilon=double_error + missing_set.compute Minimum . should_equal -41.02 epsilon=double_error + with_nans_set.compute Minimum . should_equal -41.02 epsilon=double_error + text_set.compute Minimum . should_equal "A" + simple_set.compute Maximum . should_equal 5 + number_set.compute Maximum . should_equal 42.01 epsilon=double_error + missing_set.compute Maximum . should_equal 33.85 epsilon=double_error + with_nans_set.compute Maximum . should_equal 33.85 epsilon=double_error + text_set.compute Maximum . should_equal "D" + + Test.specify "should be able to get sum of values" <| + simple_set.compute Sum . should_equal 15 epsilon=double_error + number_set.compute Sum . should_equal -101.28 epsilon=double_error + missing_set.compute Sum . should_equal -81.8 epsilon=double_error + with_nans_set.compute Sum . should_equal -81.8 epsilon=double_error + + Test.specify "should be able to get mean of values" <| + simple_set.compute Mean . should_equal 3 epsilon=double_error + number_set.compute Mean . should_equal -5.064 epsilon=double_error + missing_set.compute Mean . should_equal -5.1125 epsilon=double_error + with_nans_set.compute Mean . should_equal -5.1125 epsilon=double_error + + Test.specify "should be able to get sample variance of values" <| + simple_set.compute Variance . should_equal 2.5 epsilon=double_error + number_set.compute Variance . should_equal 582.0137832 epsilon=double_error + missing_set.compute Variance . 
should_equal 431.0218867 epsilon=double_error + with_nans_set.compute Variance . should_equal 431.0218867 epsilon=double_error + [1].compute Variance . is_nan . should_equal True + + Test.specify "should be able to get population variance of values" <| + simple_set.compute (Variance True) . should_equal 2 epsilon=double_error + number_set.compute (Variance True) . should_equal 552.913094 epsilon=double_error + missing_set.compute (Variance True) . should_equal 404.0830188 epsilon=double_error + with_nans_set.compute (Variance True) . should_equal 404.0830188 epsilon=double_error + + Test.specify "should be able to get sample standard deviation of values" <| + simple_set.compute Standard_Deviation . should_equal 1.58113883 epsilon=double_error + number_set.compute Standard_Deviation . should_equal 24.12496183 epsilon=double_error + missing_set.compute Standard_Deviation . should_equal 20.76106661 epsilon=double_error + with_nans_set.compute Standard_Deviation . should_equal 20.76106661 epsilon=double_error + [1].compute Standard_Deviation . is_nan . should_equal True + + Test.specify "should be able to get population standard deviation of values" <| + simple_set.compute (Standard_Deviation True) . should_equal 1.414213562 epsilon=double_error + number_set.compute (Standard_Deviation True) . should_equal 23.51410415 epsilon=double_error + missing_set.compute (Standard_Deviation True) . should_equal 20.1018163 epsilon=double_error + with_nans_set.compute (Standard_Deviation True) . should_equal 20.1018163 epsilon=double_error + + Test.specify "should be able to get sample skewness of values" <| + simple_set.compute Skew . should_equal 0 epsilon=double_error + number_set.compute Skew . should_equal 0.165086552 epsilon=double_error + missing_set.compute Skew . should_equal 0.084238123 epsilon=double_error + with_nans_set.compute Skew . should_equal 0.084238123 epsilon=double_error + [1, 2].compute Skew . is_nan . 
should_equal True + + Test.specify "should be able to get population skewness of values" <| + simple_set.compute (Skew True) . should_equal 0 epsilon=double_error + number_set.compute (Skew True) . should_equal 0.152437706 epsilon=double_error + missing_set.compute (Skew True) . should_equal 0.076125664 epsilon=double_error + with_nans_set.compute (Skew True) . should_equal 0.076125664 epsilon=double_error + [1, 2].compute (Skew True) . is_nan . should_equal True + + Test.specify "should be able to get sample kurtosis of values" <| + simple_set.compute Kurtosis . should_equal -1.2 epsilon=double_error + number_set.compute Kurtosis . should_equal -0.449422438 epsilon=double_error + missing_set.compute Kurtosis . should_equal -0.201991074 epsilon=double_error + with_nans_set.compute Kurtosis . should_equal -0.201991074 epsilon=double_error + [1, 2, 3].compute Kurtosis . is_nan . should_equal True + + Test.specify "should allow bulk computation" <| + stats = [Count, Minimum, Mean, Variance, Skew] + expected = [20, -45.84, -5.064, 582.0137832, 0.165086552] + values = number_set.compute_bulk stats + values.map_with_index i->v->((expected.at i - v).abs < double_error) . all v->(v == True) . should_equal True + + Test.specify "should be able to count and sum on empty Vector" <| + [].compute . should_equal 0 + [].compute Sum . should_equal 0 + + Test.specify "should fail with Empty_Error for Minimum and Maximum on empty Vector" <| + [].compute Minimum . should_fail_with Vector.Empty_Error + [].compute Maximum . should_fail_with Vector.Empty_Error + + Test.specify "should be NaN for other statistics on empty Vector" <| + [].compute Mean . is_nan . should_equal True + [].compute Variance . is_nan . should_equal True + [].compute Skew . is_nan . should_equal True + [].compute Kurtosis . is_nan . should_equal True + + Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <| + text_set.compute Sum . 
should_fail_with Illegal_Argument_Error + text_set.compute Mean . should_fail_with Illegal_Argument_Error + text_set.compute Variance . should_fail_with Illegal_Argument_Error + text_set.compute Skew . should_fail_with Illegal_Argument_Error + text_set.compute Kurtosis . should_fail_with Illegal_Argument_Error + + Test.specify "should be able to do Count, Minimum and Maximum on custom type with compare_to" <| + ord_set.compute . should_equal 3 + ord_set.compute Minimum . should_equal (Ord 10) + ord_set.compute Maximum . should_equal (Ord 2) + + Test.specify "should fail with Incomparable_Values_Error on custom type without compare_to" <| + no_ord_set.compute . should_equal 3 + no_ord_set.compute Minimum . should_fail_with Vector.Incomparable_Values_Error + no_ord_set.compute Maximum . should_fail_with Vector.Incomparable_Values_Error + + Test.specify "should fail with Incomparable_Values_Error on mixed Vectors" <| + [1, False].compute Minimum . should_fail_with Vector.Incomparable_Values_Error + +main = Test.Suite.run_main here.spec diff --git a/test/Tests/src/Data/Time/Date_Spec.enso b/test/Tests/src/Data/Time/Date_Spec.enso index 2f709dd68775..afaaac8dfbb2 100644 --- a/test/Tests/src/Data/Time/Date_Spec.enso +++ b/test/Tests/src/Data/Time/Date_Spec.enso @@ -129,3 +129,5 @@ spec = date_2 = Date.parse "2021-01-01" date_2.week_of_year Locale.us . should_equal 1 date_2.week_of_year Locale.uk . should_equal 0 + +main = Test.Suite.run_main here.spec diff --git a/test/Tests/src/Data/Time/Duration_Spec.enso b/test/Tests/src/Data/Time/Duration_Spec.enso index 77932ff16e83..f217d6c85e1d 100644 --- a/test/Tests/src/Data/Time/Duration_Spec.enso +++ b/test/Tests/src/Data/Time/Duration_Spec.enso @@ -98,4 +98,3 @@ spec = duration_1!=duration_2 . should_be_true duration_1>duration_2 . should_be_true duration_1