diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 448a22425a4..8cbadfa19a5 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -7,7 +7,7 @@ import pandas as pd
 import rmm

 import cudf
-import cudf._lib as libcudfxx
+import cudf._lib as libcudf
 from cudf.api.types import is_categorical_dtype, is_list_dtype, is_struct_dtype
 from cudf.core.buffer import Buffer

@@ -160,7 +160,7 @@ cdef class Column:
             if self.base_mask is None or self.offset == 0:
                 self._mask = self.base_mask
             else:
-                self._mask = libcudfxx.null_mask.copy_bitmask(self)
+                self._mask = libcudf.null_mask.copy_bitmask(self)
         return self._mask

     @property
diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx
index b6e26fe594f..ce83a6f0f18 100644
--- a/python/cudf/cudf/_lib/null_mask.pyx
+++ b/python/cudf/cudf/_lib/null_mask.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.

 from enum import Enum

@@ -8,9 +8,6 @@ from libcpp.utility cimport move
 from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer

 from cudf._lib.column cimport Column
-
-import cudf._lib as libcudfxx
-
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.null_mask cimport (
     bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes,
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 8cb7dd942c1..e363ea875f0 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.

 # cython: boundscheck = False

@@ -17,7 +17,7 @@ except ImportError:
     import json

 import numpy as np
-from cython.operator import dereference
+from cython.operator cimport dereference

 from cudf.api.types import (
     is_categorical_dtype,
diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx
index b4b3384032c..a2cb115f668 100644
--- a/python/cudf/cudf/_lib/rolling.pyx
+++ b/python/cudf/cudf/_lib/rolling.pyx
@@ -1,6 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
-from __future__ import print_function
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.

 import pandas as pd

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 22a5666ef3f..d13c55dfcc0 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -1,5 +1,5 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.
-from warnings import warn
+import warnings

 import cupy as cp
 import numpy as np
@@ -50,7 +50,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
         raise NotImplementedError("na_sentinel can not be None.")

     if size_hint:
-        warn("size_hint is not applicable for cudf.factorize")
+        warnings.warn("size_hint is not applicable for cudf.factorize")

     return_cupy_array = isinstance(values, cp.ndarray)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index fac8af652c1..375a19f5423 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -2,7 +2,7 @@

 from __future__ import annotations

-import datetime as dt
+import datetime
 import locale
 import re
 from locale import nl_langinfo
@@ -237,9 +237,9 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
         if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)):
             return other

-        if isinstance(other, dt.datetime):
+        if isinstance(other, datetime.datetime):
             other = np.datetime64(other)
-        elif isinstance(other, dt.timedelta):
+        elif isinstance(other, datetime.timedelta):
             other = np.timedelta64(other)
         elif isinstance(other, pd.Timestamp):
             other = other.to_datetime64()
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index f10e257d359..d8ddb3d8d1a 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2021-2022, NVIDIA CORPORATION.

+import warnings
 from decimal import Decimal
 from typing import Any, Sequence, Tuple, Union, cast
-from warnings import warn

 import cupy as cp
 import numpy as np
@@ -43,7 +43,7 @@ def as_decimal_column(
             isinstance(dtype, cudf.core.dtypes.DecimalDtype)
             and dtype.scale < self.dtype.scale
         ):
-            warn(
+            warnings.warn(
                 "cuDF truncates when downcasting decimals to a lower scale. "
                 "To round, use Series.round() or DataFrame.round()."
             )
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 15815427aca..810624e9f4e 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -2,7 +2,7 @@

 from __future__ import annotations

-import datetime as dt
+import datetime
 from typing import Any, Sequence, cast

 import numpy as np
@@ -211,7 +211,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
     def normalize_binop_value(self, other) -> ColumnBinaryOperand:
         if isinstance(other, (ColumnBase, cudf.Scalar)):
             return other
-        if isinstance(other, dt.timedelta):
+        if isinstance(other, datetime.timedelta):
             other = np.timedelta64(other)
         elif isinstance(other, pd.Timestamp):
             other = other.to_datetime64()
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 8893b85c97c..24aa0d01b3c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5596,14 +5596,14 @@ def select_dtypes(self, include=None, exclude=None):
     @ioutils.doc_to_parquet()
     def to_parquet(self, path, *args, **kwargs):
         """{docstring}"""
-        from cudf.io import parquet as pq
+        from cudf.io import parquet

-        return pq.to_parquet(self, path, *args, **kwargs)
+        return parquet.to_parquet(self, path, *args, **kwargs)

     @ioutils.doc_to_feather()
     def to_feather(self, path, *args, **kwargs):
         """{docstring}"""
-        from cudf.io import feather as feather
+        from cudf.io import feather

         feather.to_feather(self, path, *args, **kwargs)

@@ -5623,7 +5623,7 @@ def to_csv(
         **kwargs,
     ):
         """{docstring}"""
-        from cudf.io import csv as csv
+        from cudf.io import csv

         return csv.to_csv(
             self,
@@ -5643,7 +5643,7 @@ def to_orc(self, fname, compression=None, *args, **kwargs):
         """{docstring}"""
-        from cudf.io import orc as orc
+        from cudf.io import orc

         orc.to_orc(self, fname, compression, *args, **kwargs)
diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py
index 782b74ef4a6..83cceff5c4c 100644
--- a/python/cudf/cudf/core/subword_tokenizer.py
+++ b/python/cudf/cudf/core/subword_tokenizer.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.

 from __future__ import annotations

+import warnings
 from typing import Union
-from warnings import warn

 import cupy as cp

@@ -186,7 +186,7 @@ def __call__(
                 "When truncation is not True, the behaviour currently differs "
                 "from HuggingFace as cudf always returns overflowing tokens"
             )
-            warn(warning_msg)
+            warnings.warn(warning_msg)

         if padding != "max_length":
             error_msg = (
diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py
index e7cf113f604..c2cd78f88a0 100644
--- a/python/cudf/cudf/tests/test_api_types.py
+++ b/python/cudf/cudf/tests/test_api_types.py
@@ -3,10 +3,10 @@
 import numpy as np
 import pandas as pd
 import pytest
-from pandas.api import types as ptypes
+from pandas.api import types as pd_types

 import cudf
-from cudf.api import types as types
+from cudf.api import types


 @pytest.mark.parametrize(
@@ -1035,11 +1035,13 @@ def test_is_decimal_dtype(obj, expect):
     ),
 )
 def test_pandas_agreement(obj):
-    assert types.is_categorical_dtype(obj) == ptypes.is_categorical_dtype(obj)
-    assert types.is_numeric_dtype(obj) == ptypes.is_numeric_dtype(obj)
-    assert types.is_integer_dtype(obj) == ptypes.is_integer_dtype(obj)
-    assert types.is_integer(obj) == ptypes.is_integer(obj)
-    assert types.is_string_dtype(obj) == ptypes.is_string_dtype(obj)
+    assert types.is_categorical_dtype(obj) == pd_types.is_categorical_dtype(
+        obj
+    )
+    assert types.is_numeric_dtype(obj) == pd_types.is_numeric_dtype(obj)
+    assert types.is_integer_dtype(obj) == pd_types.is_integer_dtype(obj)
+    assert types.is_integer(obj) == pd_types.is_integer(obj)
+    assert types.is_string_dtype(obj) == pd_types.is_string_dtype(obj)


 @pytest.mark.parametrize(
@@ -1115,7 +1117,7 @@ def test_pandas_agreement(obj):
     ),
 )
 def test_pandas_agreement_scalar(obj):
-    assert types.is_scalar(obj) == ptypes.is_scalar(obj)
+    assert types.is_scalar(obj) == pd_types.is_scalar(obj)


 # TODO: Add test of interval.
diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py
index f06142f4cc9..15dfa111860 100644
--- a/python/cudf/cudf/tests/test_contains.py
+++ b/python/cudf/cudf/tests/test_contains.py
@@ -1,4 +1,6 @@
-from datetime import datetime as dt
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+
+import datetime

 import numpy as np
 import pandas as pd
@@ -41,12 +43,12 @@ def get_string_series():
 testdata_all = [
     (
         cudf_date_series("20010101", "20020215", freq="400h"),
-        dt.strptime("2001-01-01", "%Y-%m-%d"),
+        datetime.datetime.strptime("2001-01-01", "%Y-%m-%d"),
         True,
     ),
     (
         cudf_date_series("20010101", "20020215", freq="400h"),
-        dt.strptime("2000-01-01", "%Y-%m-%d"),
+        datetime.datetime.strptime("2000-01-01", "%Y-%m-%d"),
         False,
     ),
     (cudf_date_series("20010101", "20020215", freq="400h"), 20000101, False),
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 07261534777..2685524add4 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -13,7 +13,6 @@
 from copy import copy

 import cupy
-import cupy as cp
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -7332,7 +7331,7 @@ def test_sample_axis_0(

 @pytest.mark.parametrize("replace", [True, False])
 @pytest.mark.parametrize(
-    "random_state_lib", [cp.random.RandomState, np.random.RandomState]
+    "random_state_lib", [cupy.random.RandomState, np.random.RandomState]
 )
 def test_sample_reproducibility(replace, random_state_lib):
     df = cudf.DataFrame({"a": cupy.arange(0, 1024)})
@@ -7384,7 +7383,7 @@ def test_oversample_without_replace(n, frac, axis):
     )


-@pytest.mark.parametrize("random_state", [None, cp.random.RandomState(42)])
+@pytest.mark.parametrize("random_state", [None, cupy.random.RandomState(42)])
 def test_sample_unsupported_arguments(random_state):
     df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]})
     with pytest.raises(
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 964ac9e5457..8be338e787a 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2019-2022, NVIDIA CORPORATION.

 import datetime
-import datetime as dt
 import operator
 import re

@@ -219,8 +218,8 @@ def test_sort_datetime():

 def test_issue_165():
     df_pandas = pd.DataFrame()
-    start_date = dt.datetime.strptime("2000-10-21", "%Y-%m-%d")
-    data = [(start_date + dt.timedelta(days=x)) for x in range(6)]
+    start_date = datetime.datetime.strptime("2000-10-21", "%Y-%m-%d")
+    data = [(start_date + datetime.timedelta(days=x)) for x in range(6)]
     df_pandas["dates"] = data
     df_pandas["num"] = [1, 2, 3, 4, 5, 6]
     df_cudf = DataFrame.from_pandas(df_pandas)
diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
index e8a695570f0..a80208cfd7d 100644
--- a/python/cudf/cudf/tests/test_duplicates.py
+++ b/python/cudf/cudf/tests/test_duplicates.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.

-import itertools as it
+import itertools
 import random

 import numpy as np
@@ -280,7 +280,7 @@ def test_drop_duplicates_empty(df):

 @pytest.mark.parametrize("num_columns", [3, 4, 5])
 def test_dataframe_drop_duplicates_numeric_method(num_columns):
-    comb = list(it.permutations(range(num_columns), num_columns))
+    comb = list(itertools.permutations(range(num_columns), num_columns))
     shuf = list(comb)
     random.Random(num_columns).shuffle(shuf)

diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py
index de4303a34a8..8730cb187b5 100644
--- a/python/cudf/cudf/tests/test_hdfs.py
+++ b/python/cudf/cudf/tests/test_hdfs.py
@@ -3,12 +3,12 @@
 import os
 from io import BytesIO

-import fastavro as fa
+import fastavro
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
-from pyarrow import orc as orc
+from pyarrow import orc

 import cudf
 from cudf.testing._utils import assert_eq
@@ -253,7 +253,7 @@ def test_read_avro(datadir, hdfs, test_url):
     got = cudf.read_avro(hd_fpath)

     with open(fname, mode="rb") as f:
-        expect = pd.DataFrame.from_records(fa.reader(f))
+        expect = pd.DataFrame.from_records(fastavro.reader(f))

     for col in expect.columns:
         expect[col] = expect[col].astype(got[col].dtype)
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 5082fb08b92..c3969bf6c14 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -11,7 +11,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.orc
-import pyorc as po
+import pyorc
 import pytest

 import cudf
@@ -307,7 +307,7 @@ def test_orc_read_skiprows(tmpdir):
         {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]},
         dtype=pd.BooleanDtype(),
     )
-    writer = po.Writer(buff, po.Struct(a=po.Boolean()))
+    writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean()))
     tuples = list(
         map(
             lambda x: (None,) if x[0] is pd.NA else x,
@@ -931,29 +931,35 @@ def generate_list_struct_buff(size=100_000):

     buff = BytesIO()
     schema = {
-        "lvl3_list": po.Array(po.Array(po.Array(po.BigInt()))),
-        "lvl1_list": po.Array(po.BigInt()),
-        "lvl1_struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
-        "lvl2_struct": po.Struct(
+        "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))),
+        "lvl1_list": pyorc.Array(pyorc.BigInt()),
+        "lvl1_struct": pyorc.Struct(
+            **{"a": pyorc.BigInt(), "b": pyorc.BigInt()}
+        ),
+        "lvl2_struct": pyorc.Struct(
             **{
-                "a": po.BigInt(),
-                "lvl1_struct": po.Struct(
-                    **{"c": po.BigInt(), "d": po.BigInt()}
+                "a": pyorc.BigInt(),
+                "lvl1_struct": pyorc.Struct(
+                    **{"c": pyorc.BigInt(), "d": pyorc.BigInt()}
                 ),
             }
         ),
-        "list_nests_struct": po.Array(
-            po.Array(po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}))
+        "list_nests_struct": pyorc.Array(
+            pyorc.Array(
+                pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()})
+            )
         ),
-        "struct_nests_list": po.Struct(
+        "struct_nests_list": pyorc.Struct(
             **{
-                "struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
-                "list": po.Array(po.BigInt()),
+                "struct": pyorc.Struct(
+                    **{"a": pyorc.BigInt(), "b": pyorc.BigInt()}
+                ),
+                "list": pyorc.Array(pyorc.BigInt()),
             }
         ),
     }

-    schema = po.Struct(**schema)
+    schema = pyorc.Struct(**schema)

     lvl3_list = [
         rd.choice(
@@ -1019,7 +1025,7 @@
         }
     )

-    writer = po.Writer(buff, schema, stripe_size=1024)
+    writer = pyorc.Writer(buff, schema, stripe_size=1024)
     tuples = list(
         map(
             lambda x: (None,) if x[0] is pd.NA else x,
@@ -1101,15 +1107,17 @@ def gen_map_buff(size=10000):

     buff = BytesIO()
     schema = {
-        "lvl1_map": po.Map(key=po.String(), value=po.BigInt()),
-        "lvl2_map": po.Map(key=po.String(), value=po.Array(po.BigInt())),
-        "lvl2_struct_map": po.Map(
-            key=po.String(),
-            value=po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}),
+        "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()),
+        "lvl2_map": pyorc.Map(
+            key=pyorc.String(), value=pyorc.Array(pyorc.BigInt())
+        ),
+        "lvl2_struct_map": pyorc.Map(
+            key=pyorc.String(),
+            value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}),
         ),
     }

-    schema = po.Struct(**schema)
+    schema = pyorc.Struct(**schema)

     lvl1_map = [
         rd.choice(
@@ -1186,8 +1194,8 @@
             "lvl2_struct_map": lvl2_struct_map,
         }
     )
-    writer = po.Writer(
-        buff, schema, stripe_size=1024, compression=po.CompressionKind.NONE
+    writer = pyorc.Writer(
+        buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE
     )
     tuples = list(
         map(
@@ -1479,8 +1487,9 @@ def test_statistics_sum_overflow():
     maxint64 = np.iinfo(np.int64).max
     minint64 = np.iinfo(np.int64).min
     buff = BytesIO()
-    with po.Writer(
-        buff, po.Struct(a=po.BigInt(), b=po.BigInt(), c=po.BigInt())
+    with pyorc.Writer(
+        buff,
+        pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), c=pyorc.BigInt()),
     ) as writer:
         writer.write((maxint64, minint64, minint64))
         writer.write((1, -1, 1))
@@ -1497,20 +1506,20 @@ def test_empty_statistics():
     buff = BytesIO()
-    orc_schema = po.Struct(
-        a=po.BigInt(),
-        b=po.Double(),
-        c=po.String(),
-        d=po.Decimal(11, 2),
-        e=po.Date(),
-        f=po.Timestamp(),
-        g=po.Boolean(),
-        h=po.Binary(),
-        i=po.BigInt(),
+    orc_schema = pyorc.Struct(
+        a=pyorc.BigInt(),
+        b=pyorc.Double(),
+        c=pyorc.String(),
+        d=pyorc.Decimal(11, 2),
+        e=pyorc.Date(),
+        f=pyorc.Timestamp(),
+        g=pyorc.Boolean(),
+        h=pyorc.Binary(),
+        i=pyorc.BigInt(),
         # One column with non null value, else cudf/pyorc readers crash
     )
     data = tuple([None] * (len(orc_schema.fields) - 1) + [1])
-    with po.Writer(buff, orc_schema) as writer:
+    with pyorc.Writer(buff, orc_schema) as writer:
         writer.write(data)

     got = cudf.io.orc.read_orc_statistics([buff])
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index e8382681820..79211456996 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2021-2022, NVIDIA CORPORATION.

 import datetime
-import datetime as dt
 import re
 from decimal import Decimal

@@ -11,7 +10,6 @@
 import pytest

 import cudf
-from cudf import Scalar as pycudf_scalar
 from cudf._lib.copying import get_element
 from cudf.testing._utils import (
     ALL_TYPES,
@@ -297,9 +295,9 @@ def test_date_duration_scalars(value):

     actual = s.value

-    if isinstance(value, dt.datetime):
+    if isinstance(value, datetime.datetime):
         expected = np.datetime64(value)
-    elif isinstance(value, dt.timedelta):
+    elif isinstance(value, datetime.timedelta):
         expected = np.timedelta64(value)
     elif isinstance(value, pd.Timestamp):
         expected = value.to_datetime64()
@@ -344,7 +342,7 @@ def test_scalar_invalid_implicit_conversion(cls, dtype):
         cls(pd.NA)
     except TypeError as e:
         with pytest.raises(TypeError, match=re.escape(str(e))):
-            slr = pycudf_scalar(None, dtype=dtype)
+            slr = cudf.Scalar(None, dtype=dtype)
             cls(slr)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 4cd1738996f..35c6fdc73f8 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.

-import datetime as dt
+import datetime
 from collections import namedtuple
 from decimal import Decimal

@@ -259,9 +259,9 @@ def to_cudf_compatible_scalar(val, dtype=None):
     ) or cudf.api.types.is_string_dtype(dtype):
         dtype = "str"

-    if isinstance(val, dt.datetime):
+    if isinstance(val, datetime.datetime):
         val = np.datetime64(val)
-    elif isinstance(val, dt.timedelta):
+    elif isinstance(val, datetime.timedelta):
         val = np.timedelta64(val)
     elif isinstance(val, pd.Timestamp):
         val = val.to_datetime64()
diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py
index cdaaff6b2af..25b3d517e1c 100644
--- a/python/cudf/cudf/utils/queryutils.py
+++ b/python/cudf/cudf/utils/queryutils.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2018-2022, NVIDIA CORPORATION.

 import ast
-import datetime as dt
+import datetime
 from typing import Any, Dict

 import numpy as np
@@ -232,7 +232,7 @@ def query_execute(df, expr, callenv):
         name = name[len(ENVREF_PREFIX) :]
         try:
             val = envdict[name]
-            if isinstance(val, dt.datetime):
+            if isinstance(val, datetime.datetime):
                 val = np.datetime64(val)
         except KeyError:
             msg = "{!r} not defined in the calling environment"
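For reference, the snippet below sketches the import convention this diff applies throughout: import a module under its full name (`import datetime`, `import warnings`, `import pyorc`) and reach attributes through it, rather than binding aliases such as `dt` or `po` or bare names such as `warn`. The helper function and its name are illustrative only (they loosely mirror the `isinstance` checks in `to_cudf_compatible_scalar`); they are not part of the patch.

```python
# Illustrative sketch only; not part of the patch.
import datetime
import warnings

import numpy as np


def to_numpy_temporal(val):
    """Coerce a stdlib datetime/timedelta to a NumPy scalar, using
    fully qualified names (datetime.datetime, warnings.warn) in the
    style of the diff, instead of aliases (dt.datetime, warn)."""
    if isinstance(val, datetime.datetime):
        return np.datetime64(val)
    if isinstance(val, datetime.timedelta):
        return np.timedelta64(val)
    warnings.warn("value is neither a datetime nor a timedelta; returning it unchanged")
    return val


print(to_numpy_temporal(datetime.datetime(2001, 1, 1)))  # 2001-01-01T00:00:00.000000
```

The `parquet.pyx` change is slightly different in kind: `dereference` is a Cython compile-time operator, so it is brought in with `cimport` rather than a runtime `import`; the pure-Python sketch above does not cover that case.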