From 8353d6839f45b2f6930ac3f928e6844889de70e4 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 10:25:08 -0800 Subject: [PATCH 01/11] basic var algorithm change --- python/dask_cudf/dask_cudf/core.py | 41 +++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 0ba35460835..dc96caca912 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -296,17 +296,40 @@ def var( return handle_out(out, result) else: + + def _local_var(x, skipna): + n = len(x) + avg = x.mean(skipna=skipna) + m2 = ((x - avg) ** 2).sum(skipna=skipna) + return n, avg, m2 + + def _aggregate_var(parts): + n, avg, m2 = parts[0] + for i in range(1, len(parts)): + n_a, avg_a, m2_a = n, avg, m2 + n_b, avg_b, m2_b = parts[i] + n = n_a + n_b + avg = (n_a * avg_a + n_b * avg_b) / n + delta = avg_b - avg_a + m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n + return m2 / (n - 1) + + dsk = {} + name = "var-" + tokenize( + axis, skipna, ddof, split_every, dtype, out + ) + local_name = "local-" + name num = self._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = self._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof + parts = [] + for n in range(num.npartitions): + parts.append((local_name, n)) + dsk[parts[-1]] = (_local_var, (num._name, n), skipna) + dsk[(name, 0)] = (_aggregate_var, parts) + + graph = HighLevelGraph.from_collections( + name, dsk, dependencies=[num] ) - if isinstance(self, DataFrame): - result.divisions = (min(self.columns), max(self.columns)) - return handle_out(out, result) + return dd.core.new_dd_object(graph, name, meta, (None, None)) def repartition(self, *args, **kwargs): """ Wraps dask.dataframe DataFrame.repartition method. 
From ba71d7dfafd2d55207493c209809bf453490d6a3 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 10:49:54 -0800 Subject: [PATCH 02/11] api cleanup --- python/dask_cudf/dask_cudf/core.py | 108 ++++++++++++++++------------- 1 file changed, 61 insertions(+), 47 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index dc96caca912..cb96e0b3e55 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -280,6 +280,7 @@ def var( split_every=False, dtype=None, out=None, + naive=False, ): axis = self._validate_axis(axis) meta = self._meta_nonempty.var(axis=axis, skipna=skipna) @@ -294,42 +295,12 @@ def var( ddof=ddof, ) return handle_out(out, result) - - else: - - def _local_var(x, skipna): - n = len(x) - avg = x.mean(skipna=skipna) - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n - return m2 / (n - 1) - - dsk = {} - name = "var-" + tokenize( - axis, skipna, ddof, split_every, dtype, out - ) - local_name = "local-" + name - num = self._get_numeric_data() - parts = [] - for n in range(num.npartitions): - parts.append((local_name, n)) - dsk[parts[-1]] = (_local_var, (num._name, n), skipna) - dsk[(name, 0)] = (_aggregate_var, parts) - - graph = HighLevelGraph.from_collections( - name, dsk, dependencies=[num] + elif naive: + return _parallel_naive_var( + self, meta, skipna, ddof, split_every, out ) - return dd.core.new_dd_object(graph, name, meta, (None, None)) + else: + return _parallel_welford_var(self, meta, skipna, split_every, out) def repartition(self, *args, **kwargs): """ Wraps dask.dataframe DataFrame.repartition method. 
@@ -419,6 +390,7 @@ def var( split_every=False, dtype=None, out=None, + naive=False, ): axis = self._validate_axis(axis) meta = self._meta_nonempty.var(axis=axis, skipna=skipna) @@ -433,19 +405,12 @@ def var( ddof=ddof, ) return handle_out(out, result) - - else: - num = self._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = self._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof + elif naive: + return _parallel_naive_var( + self, meta, skipna, ddof, split_every, out ) - if isinstance(self, DataFrame): - result.divisions = (min(self.columns), max(self.columns)) - return handle_out(out, result) + else: + return _parallel_welford_var(self, meta, skipna, split_every, out) def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy @@ -457,6 +422,55 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index +def _parallel_naive_var(ddf, meta, skipna, ddof, split_every, out): + num = ddf._get_numeric_data() + x = 1.0 * num.sum(skipna=skipna, split_every=split_every) + x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) + n = num.count(split_every=split_every) + name = ddf._token_prefix + "var" + result = map_partitions( + var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof + ) + if isinstance(ddf, DataFrame): + result.divisions = (min(ddf.columns), max(ddf.columns)) + return handle_out(out, result) + + +def _parallel_welford_var(ddf, meta, skipna, split_every, out): + def _local_var(x, skipna): + n = len(x) + avg = x.mean(skipna=skipna) + m2 = ((x - avg) ** 2).sum(skipna=skipna) + return n, avg, m2 + + def _aggregate_var(parts): + n, avg, m2 = parts[0] + for i in range(1, len(parts)): + n_a, avg_a, m2_a = n, avg, m2 + n_b, avg_b, m2_b = parts[i] + n = n_a + n_b + avg = (n_a * avg_a + n_b * avg_b) / n + delta = avg_b - avg_a + 
m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n + return m2 / (n - 1) + + dsk = {} + name = "var-" + tokenize(skipna, split_every, out) + local_name = "local-" + name + num = ddf._get_numeric_data() + parts = [] + for n in range(num.npartitions): + parts.append((local_name, n)) + dsk[parts[-1]] = (_local_var, (num._name, n), skipna) + dsk[(name, 0)] = (_aggregate_var, parts) + + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num]) + result = dd.core.new_dd_object(graph, name, meta, (None, None)) + if isinstance(ddf, DataFrame): + result.divisions = (min(ddf.columns), max(ddf.columns)) + return handle_out(out, result) + + def _extract_meta(x): """ Extract internal cache data (``_meta``) from dask_cudf objects From 943d3b4fe363bfa911def98c90e887db3e8e0399 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 11:30:34 -0800 Subject: [PATCH 03/11] fix test --- python/dask_cudf/dask_cudf/core.py | 52 +++++++++++++------ python/dask_cudf/dask_cudf/tests/test_core.py | 30 +++++++++++ 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index cb96e0b3e55..058584c7989 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,4 +1,5 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. +import math import warnings from distutils.version import LooseVersion @@ -296,11 +297,9 @@ def var( ) return handle_out(out, result) elif naive: - return _parallel_naive_var( - self, meta, skipna, ddof, split_every, out - ) + return _naive_var(self, meta, skipna, ddof, split_every, out) else: - return _parallel_welford_var(self, meta, skipna, split_every, out) + return _parallel_var(self, meta, skipna, split_every, out) def repartition(self, *args, **kwargs): """ Wraps dask.dataframe DataFrame.repartition method. 
@@ -406,11 +405,9 @@ def var( ) return handle_out(out, result) elif naive: - return _parallel_naive_var( - self, meta, skipna, ddof, split_every, out - ) + return _naive_var(self, meta, skipna, ddof, split_every, out) else: - return _parallel_welford_var(self, meta, skipna, split_every, out) + return _parallel_var(self, meta, skipna, split_every, out) def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy @@ -422,7 +419,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index -def _parallel_naive_var(ddf, meta, skipna, ddof, split_every, out): +def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) @@ -436,7 +433,7 @@ def _parallel_naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) -def _parallel_welford_var(ddf, meta, skipna, split_every, out): +def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): n = len(x) avg = x.mean(skipna=skipna) @@ -452,17 +449,40 @@ def _aggregate_var(parts): avg = (n_a * avg_a + n_b * avg_b) / n delta = avg_b - avg_a m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n + return n, avg, m2 + + def _finalize_var(vals): + n, _, m2 = vals return m2 / (n - 1) - dsk = {} + # Build graph + nparts = ddf.npartitions + if not split_every: + split_every = nparts name = "var-" + tokenize(skipna, split_every, out) local_name = "local-" + name num = ddf._get_numeric_data() - parts = [] - for n in range(num.npartitions): - parts.append((local_name, n)) - dsk[parts[-1]] = (_local_var, (num._name, n), skipna) - dsk[(name, 0)] = (_aggregate_var, parts) + dsk = { + (local_name, n, 0): (_local_var, (num._name, n), skipna) + for n in range(nparts) + } + + # Use reduction tree + widths = [nparts] + while nparts > 1: + nparts = math.ceil(nparts / split_every) + widths.append(nparts) + height = len(widths) + for depth 
in range(1, height): + for group in range(widths[depth]): + p_max = widths[depth - 1] + lstart = split_every * group + lstop = min(lstart + split_every, p_max) + node_list = [ + (local_name, p, depth - 1) for p in range(lstart, lstop) + ] + dsk[(local_name, group, depth)] = (_aggregate_var, node_list) + dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num]) result = dd.core.new_dd_object(graph, name, meta, (None, None)) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 10719794843..8ed97fc356b 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -720,6 +720,36 @@ def test_dataframe_describe(): dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) +def test_zero_std_describe(): + num = 84886781 + df = cudf.DataFrame( + { + "x": np.full((20,), num, dtype=np.float), + "y": np.full((20,), num, dtype=np.float), + } + ) + pdf = df.to_pandas() + ddf = dgd.from_cudf(df, npartitions=4) + pddf = dd.from_pandas(pdf, npartitions=4) + + dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) + + +def test_large_numbers_describe(): + num = 8488678001 + df = cudf.DataFrame( + { + "x": np.arange(num, num + 1000, dtype=np.float), + "y": np.arange(num, num + 1000, dtype=np.float), + } + ) + pdf = df.to_pandas() + ddf = dgd.from_cudf(df, npartitions=4) + pddf = dd.from_pandas(pdf, npartitions=4) + + dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) + + def test_index_map_partitions(): # https://github.com/rapidsai/cudf/issues/6738 From d8db547c9f0150ac3f2bed26efb092d94c66b76c Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 11:43:54 -0800 Subject: [PATCH 04/11] add series test --- python/dask_cudf/dask_cudf/core.py | 2 +- python/dask_cudf/dask_cudf/tests/test_core.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 
deletion(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 058584c7989..00b18586378 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -484,7 +484,7 @@ def _finalize_var(vals): dsk[(local_name, group, depth)] = (_aggregate_var, node_list) dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num]) + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) result = dd.core.new_dd_object(graph, name, meta, (None, None)) if isinstance(ddf, DataFrame): result.divisions = (min(ddf.columns), max(ddf.columns)) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 8ed97fc356b..cf0bc7f0a01 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -709,6 +709,23 @@ def test_dataframe_set_index(): assert_eq(ddf.compute(), pddf.compute()) +def test_series_describe(): + random.seed(0) + sr = cudf.datasets.randomdata(20)["x"] + psr = sr.to_pandas() + + dsr = dgd.from_cudf(sr, npartitions=4) + pdsr = dd.from_pandas(psr, npartitions=4) + + # NOTE: Removing `compute` is causing an + # "incorrect dependencies" error here. 
+ dd.assert_eq( + dsr.describe().compute(), + pdsr.describe().compute(), + check_less_precise=3, + ) + + def test_dataframe_describe(): random.seed(0) df = cudf.datasets.randomdata(20) From 311756f9da319fd4bcb5f15a0f2006de61c9a26f Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 16:29:53 -0800 Subject: [PATCH 05/11] fix bug in reduction --- python/dask_cudf/dask_cudf/core.py | 11 +++++------ python/dask_cudf/dask_cudf/tests/test_core.py | 4 +--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 00b18586378..1c88ec7ea74 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -368,7 +368,7 @@ class Series(_Frame, dd.core.Series): def count(self, split_every=False): return reduction( - self, + [self], chunk=M.count, aggregate=np.sum, split_every=split_every, @@ -436,6 +436,8 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): n = len(x) + # TODO: x.sum()/n seems to be faster than x.mean() + # on Quadro RTX 8000 - Need to compare on V/A100 avg = x.mean(skipna=skipna) m2 = ((x - avg) ** 2).sum(skipna=skipna) return n, avg, m2 @@ -666,11 +668,8 @@ def reduction( meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) meta = dd.core.make_meta(meta) - for arg in args: - if isinstance(arg, _Frame): - dsk.update(arg.dask) - - return dd.core.new_dd_object(dsk, b, meta, (None, None)) + graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) + return dd.core.new_dd_object(graph, b, meta, (None, None)) def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cf0bc7f0a01..3045c918ef1 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ 
-720,9 +720,7 @@ def test_series_describe(): # NOTE: Removing `compute` is causing an # "incorrect dependencies" error here. dd.assert_eq( - dsr.describe().compute(), - pdsr.describe().compute(), - check_less_precise=3, + dsr.describe(), pdsr.describe(), check_less_precise=3, ) From 25de14c81a8952db99c0593ad584707e6b8a8541 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 16:30:50 -0800 Subject: [PATCH 06/11] fix bug in reduction --- python/dask_cudf/dask_cudf/tests/test_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 3045c918ef1..4cd79a8df65 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -717,8 +717,6 @@ def test_series_describe(): dsr = dgd.from_cudf(sr, npartitions=4) pdsr = dd.from_pandas(psr, npartitions=4) - # NOTE: Removing `compute` is causing an - # "incorrect dependencies" error here. dd.assert_eq( dsr.describe(), pdsr.describe(), check_less_precise=3, ) From cb570a7247998705bc33f81b66b78a3a4ccce3a2 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 20:01:40 -0800 Subject: [PATCH 07/11] fix count logic --- python/dask_cudf/dask_cudf/core.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 1c88ec7ea74..3fd96955225 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -435,10 +435,12 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): - n = len(x) - # TODO: x.sum()/n seems to be faster than x.mean() - # on Quadro RTX 8000 - Need to compare on V/A100 - avg = x.mean(skipna=skipna) + if skipna: + n = x.count(skipna=skipna) + avg = x.mean(skipna=skipna) + else: + n = len(x) + avg = x.sum(skipna=skipna) / n m2 = ((x - avg) ** 
2).sum(skipna=skipna) return n, avg, m2 From 301f52e822e2a55e19f11fca1805c3555e2d44be Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 20:03:12 -0800 Subject: [PATCH 08/11] trigger reformatting --- python/dask_cudf/dask_cudf/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 3fd96955225..3cf85e3283f 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -439,6 +439,8 @@ def _local_var(x, skipna): n = x.count(skipna=skipna) avg = x.mean(skipna=skipna) else: + # Not skipping nulls, so might as well + # avoid the full `count` operation n = len(x) avg = x.sum(skipna=skipna) / n m2 = ((x - avg) ** 2).sum(skipna=skipna) From 56dcbe4a7768659a09cdce7444e3665bd34d0faa Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 25 Feb 2021 20:19:48 -0800 Subject: [PATCH 09/11] avoid float dep warning --- python/dask_cudf/dask_cudf/tests/test_core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 4cd79a8df65..2d2230b5716 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -737,8 +737,8 @@ def test_zero_std_describe(): num = 84886781 df = cudf.DataFrame( { - "x": np.full((20,), num, dtype=np.float), - "y": np.full((20,), num, dtype=np.float), + "x": np.full((20,), num, dtype=np.float64), + "y": np.full((20,), num, dtype=np.float64), } ) pdf = df.to_pandas() @@ -752,8 +752,8 @@ def test_large_numbers_describe(): num = 8488678001 df = cudf.DataFrame( { - "x": np.arange(num, num + 1000, dtype=np.float), - "y": np.arange(num, num + 1000, dtype=np.float), + "x": np.arange(num, num + 1000, dtype=np.float64), + "y": np.arange(num, num + 1000, dtype=np.float64), } ) pdf = df.to_pandas() From e68d4a2333473b935d1d64e9bc0c27e296d516c5 Mon Sep 17 00:00:00 2001 From: rjzamora 
Date: Thu, 25 Feb 2021 21:04:49 -0800 Subject: [PATCH 10/11] cover edge case (single row) --- python/dask_cudf/dask_cudf/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 3cf85e3283f..aa83bad7630 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -488,6 +488,8 @@ def _finalize_var(vals): (local_name, p, depth - 1) for p in range(lstart, lstop) ] dsk[(local_name, group, depth)] = (_aggregate_var, node_list) + if height == 1: + group = depth = 0 dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) From 0ceeadeda4203475bb76663ea5d47aaf7b9cbc28 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Fri, 26 Feb 2021 06:55:08 -0800 Subject: [PATCH 11/11] change test_large_numbers_describe to test_large_numbers_var (seems occasional failures are not related to var) --- python/dask_cudf/dask_cudf/tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 2d2230b5716..a85034224a2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -748,7 +748,7 @@ def test_zero_std_describe(): dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) -def test_large_numbers_describe(): +def test_large_numbers_var(): num = 8488678001 df = cudf.DataFrame( { @@ -760,7 +760,7 @@ def test_large_numbers_describe(): ddf = dgd.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) + dd.assert_eq(ddf.var(), pddf.var(), check_less_precise=3) def test_index_map_partitions():