diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e252af717ce..a030f3bd25b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -61,6 +61,16 @@ repos: # https://github.com/PyCQA/pydocstyle/issues/603 additional_dependencies: [toml] args: ["--config=pyproject.toml"] + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.6.3 + hooks: + - id: nbqa-isort + # Use the cudf_kafka isort orderings in notebooks so that dask + # and RAPIDS packages have their own sections. + args: ["--settings-file=python/cudf_kafka/pyproject.toml"] + - id: nbqa-black + # Explicitly specify the pyproject.toml at the repo root, not per-project. + args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 hooks: diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index af938b79a29..0352c624e04 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -35,6 +35,7 @@ "\n", "import cupy as cp\n", "import pandas as pd\n", + "\n", "import cudf\n", "import dask_cudf\n", "\n", diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index 3e169984ace..c98a4ddea23 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -18,9 +18,10 @@ "outputs": [], "source": [ "import timeit\n", - "from packaging import version\n", "\n", "import cupy as cp\n", + "from packaging import version\n", + "\n", "import cudf\n", "\n", "if version.parse(cp.__version__) >= version.parse(\"10.0.0\"):\n", @@ -63,10 +64,13 @@ ], "source": [ "nelem = 10000\n", - "df = cudf.DataFrame({'a':range(nelem),\n", - " 'b':range(500, nelem + 500),\n", - " 'c':range(1000, nelem + 1000)}\n", - " )\n", + "df = cudf.DataFrame(\n", + " {\n", + " \"a\": range(nelem),\n", + " \"b\": range(500, nelem + 500),\n", + " \"c\": range(1000, nelem + 1000),\n", + " }\n", + ")\n", "\n", "%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", "%timeit arr_cupy = df.values\n", @@ -138,7 +142,7 @@ } ], "source": [ - "col = 'a'\n", + "col = \"a\"\n", "\n", "%timeit cola_cupy = cp.asarray(df[col])\n", "%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack())\n", @@ -1088,14 +1092,16 @@ "metadata": {}, "outputs": [], "source": [ - "def cudf_to_cupy_sparse_matrix(data, sparseformat='column'):\n", - " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\n", - " \"\"\"\n", - " if sparseformat not in ('row', 'column',):\n", + "def cudf_to_cupy_sparse_matrix(data, sparseformat=\"column\"):\n", + " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\"\"\"\n", + " if sparseformat not in (\n", + " \"row\",\n", + " \"column\",\n", + " ):\n", " raise ValueError(\"Let's focus on column and row formats for now.\")\n", - " \n", + "\n", " _sparse_constructor = cp.sparse.csc_matrix\n", - " if sparseformat == 'row':\n", + " if sparseformat == \"row\":\n", " _sparse_constructor = cp.sparse.csr_matrix\n", "\n", " return _sparse_constructor(cupy_from_dlpack(data.to_dlpack()))" @@ -1121,8 +1127,8 @@ "nonzero = 1000\n", "for i in range(20):\n", " arr = cp.random.normal(5, 5, nelem)\n", - " arr[cp.random.choice(arr.shape[0], nelem-nonzero, replace=False)] = 0\n", - " df['a' + str(i)] = arr" + " arr[cp.random.choice(arr.shape[0], nelem - nonzero, replace=False)] = 0\n", + " df[\"a\" + str(i)] = arr" ] }, { diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 943fc980a31..ba8c65784d2 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -15,9 +15,10 @@ "metadata": {}, "outputs": [], "source": [ + "import numpy as np\n", + "\n", "import cudf\n", - "from cudf.datasets import randomdata\n", - "import numpy as np" + "from cudf.datasets import randomdata" ] }, { @@ -375,7 +376,7 @@ "metadata": {}, "outputs": [], "source": [ - "sr = cudf.Series(['', 'abc', 'some_example'])" + "sr = cudf.Series([\"\", \"abc\", \"some_example\"])" ] }, { @@ -387,9 +388,9 @@ "source": [ "def f(st):\n", " if len(st) > 0:\n", - " if st.startswith('a'):\n", + " if st.startswith(\"a\"):\n", " return 1\n", - " elif 'example' in st:\n", + " elif \"example\" in st:\n", " return 2\n", " else:\n", " return -1\n", @@ -443,6 +444,7 @@ "outputs": [], "source": [ "from cudf.core.udf.utils import set_malloc_heap_size\n", + "\n", "set_malloc_heap_size(int(2e9))" ] }, @@ -472,7 +474,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)" + "df = randomdata(nrows=5, dtypes={\"a\": int, \"b\": int, \"c\": int}, seed=12)" ] }, { @@ -484,10 +486,11 @@ "source": [ "from numba import cuda\n", "\n", + "\n", "@cuda.jit\n", "def multiply(in_col, out_col, multiplier):\n", " i = cuda.grid(1)\n", - " if i < in_col.size: # boundary guard\n", + " if i < in_col.size: # boundary guard\n", " out_col[i] = in_col[i] * multiplier" ] }, @@ -508,9 +511,9 @@ "metadata": {}, "outputs": [], "source": [ - "size = len(df['a'])\n", - "df['e'] = 0.0\n", - "multiply.forall(size)(df['a'], df['e'], 10.0)" + "size = len(df[\"a\"])\n", + "df[\"e\"] = 0.0\n", + "multiply.forall(size)(df[\"a\"], df[\"e\"], 10.0)" ] }, { @@ -658,7 +661,7 @@ "outputs": [], "source": [ "def f(row):\n", - " return row['A'] + row['B']" + " return row[\"A\"] + row[\"B\"]" ] }, { @@ -733,10 +736,7 @@ } ], "source": [ - "df = cudf.DataFrame({\n", - " 'A': [1,2,3],\n", - " 'B': [4,cudf.NA,6]\n", - "})\n", + "df = cudf.DataFrame({\"A\": [1, 2, 3], \"B\": [4, cudf.NA, 6]})\n", "df" ] }, @@ -881,13 +881,14 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", + " x = row[\"a\"]\n", " if x is cudf.NA:\n", " return 0\n", " else:\n", " return x + 1\n", "\n", - "df = cudf.DataFrame({'a': [1, cudf.NA, 3]})\n", + "\n", + "df = cudf.DataFrame({\"a\": [1, cudf.NA, 3]})\n", "df" ] }, @@ -988,17 +989,15 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", - " y = row['b']\n", + " x = row[\"a\"]\n", + " y = row[\"b\"]\n", " if x + y > 3:\n", " return cudf.NA\n", " else:\n", " return x + y\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3], \n", - " 'b': [2, 1, 1]\n", - "})\n", + "\n", + "df = cudf.DataFrame({\"a\": [1, 2, 3], \"b\": [2, 1, 1]})\n", "df" ] }, @@ -1099,12 +1098,10 @@ ], "source": [ "def f(row):\n", - " return row['a'] + row['b']\n", + " return row[\"a\"] + row[\"b\"]\n", + "\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3], \n", - " 'b': [0.5, cudf.NA, 3.14]\n", - "})\n", + "df = cudf.DataFrame({\"a\": [1, 2, 3], \"b\": [0.5, cudf.NA, 3.14]})\n", "df" ] }, @@ -1214,15 +1211,14 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", + " x = row[\"a\"]\n", " if x > 3:\n", - " return x\n", + " return x\n", " else:\n", - " return 1.5\n", + " return 1.5\n", + "\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 3, 5]\n", - "})\n", + "df = cudf.DataFrame({\"a\": [1, 3, 5]})\n", "df" ] }, @@ -1335,15 +1331,18 @@ ], "source": [ "def f(row):\n", - " return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e']\n", + " return row[\"a\"] + (row[\"b\"] - (row[\"c\"] / row[\"d\"])) % row[\"e\"]\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3],\n", - " 'b': [4, 5, 6],\n", - " 'c': [cudf.NA, 4, 4],\n", - " 'd': [8, 7, 8],\n", - " 'e': [7, 1, 6]\n", - "})\n", + "\n", + "df = cudf.DataFrame(\n", + " {\n", + " \"a\": [1, 2, 3],\n", + " \"b\": [4, 5, 6],\n", + " \"c\": [cudf.NA, 4, 4],\n", + " \"d\": [8, 7, 8],\n", + " \"e\": [7, 1, 6],\n", + " }\n", + ")\n", "df" ] }, @@ -1451,10 +1450,9 @@ } ], "source": [ - "str_df = cudf.DataFrame({\n", - " 'str_col': ['abc', 'ABC', 'Example'],\n", - " 'scale': [1, 2, 3]\n", - "})\n", + "str_df = cudf.DataFrame(\n", + " {\"str_col\": [\"abc\", \"ABC\", \"Example\"], \"scale\": [1, 2, 3]}\n", + ")\n", "str_df" ] }, @@ -1466,9 +1464,9 @@ "outputs": [], "source": [ "def f(row):\n", - " st = row['str_col']\n", - " scale = row['scale']\n", - " \n", + " st = row[\"str_col\"]\n", + " scale = row[\"scale\"]\n", + "\n", " if len(st) > 5:\n", " return len(st) + scale\n", " else:\n", @@ -1626,11 +1624,12 @@ } ], "source": [ - "df = df.apply_rows(conditional_add, \n", - " incols={'a':'x', 'e':'y'},\n", - " outcols={'out': np.float64},\n", - " kwargs={}\n", - " )\n", + "df = df.apply_rows(\n", + " conditional_add,\n", + " incols={\"a\": \"x\", \"e\": \"y\"},\n", + " outcols={\"out\": np.float64},\n", + " kwargs={},\n", + ")\n", "df.head()" ] }, @@ -1738,10 +1737,11 @@ " for i, (x, y) in enumerate(zip(a, b)):\n", " out[i] = x + y\n", "\n", - "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)\n", - "df.loc[2, 'a'] = None\n", - "df.loc[3, 'b'] = None\n", - "df.loc[1, 'c'] = None\n", + "\n", + "df = randomdata(nrows=5, dtypes={\"a\": int, \"b\": int, \"c\": int}, seed=12)\n", + "df.loc[2, \"a\"] = None\n", + "df.loc[3, \"b\"] = None\n", + "df.loc[1, \"c\"] = None\n", "df.head()" ] }, @@ -1841,10 +1841,9 @@ } ], "source": [ - "df = df.apply_rows(gpu_add, \n", - " incols=['a', 'b'],\n", - " outcols={'out':np.float64},\n", - " kwargs={})\n", + "df = df.apply_rows(\n", + " gpu_add, incols=[\"a\", \"b\"], outcols={\"out\": np.float64}, kwargs={}\n", + ")\n", "df.head()" ] }, @@ -1892,7 +1891,7 @@ } ], "source": [ - "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype='float64')\n", + "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype=\"float64\")\n", "ser" ] }, @@ -1935,12 +1934,13 @@ "source": [ "import math\n", "\n", + "\n", "def example_func(window):\n", " b = 0\n", " for a in window:\n", " b = max(b, math.sqrt(a))\n", " if b == 8:\n", - " return 100 \n", + " return 100\n", " return b" ] }, @@ -2064,8 +2064,8 @@ ], "source": [ "df2 = cudf.DataFrame()\n", - "df2['a'] = np.arange(55, 65, dtype='float64')\n", - "df2['b'] = np.arange(55, 65, dtype='float64')\n", + "df2[\"a\"] = np.arange(55, 65, dtype=\"float64\")\n", + "df2[\"b\"] = np.arange(55, 65, dtype=\"float64\")\n", "df2.head()" ] }, @@ -2279,7 +2279,9 @@ } ], "source": [ - "df = randomdata(nrows=10, dtypes={'a':float, 'b':bool, 'c':str, 'e': float}, seed=12)\n", + "df = randomdata(\n", + " nrows=10, dtypes={\"a\": float, \"b\": bool, \"c\": str, \"e\": float}, seed=12\n", + ")\n", "df.head()" ] }, @@ -2290,7 +2292,7 @@ "metadata": {}, "outputs": [], "source": [ - "grouped = df.groupby(['b'])" + "grouped = df.groupby([\"b\"])" ] }, { @@ -2469,9 +2471,9 @@ } ], "source": [ - "results = grouped.apply_grouped(rolling_avg,\n", - " incols=['e'],\n", - " outcols=dict(rolling_avg_e=np.float64))\n", + "results = grouped.apply_grouped(\n", + " rolling_avg, incols=[\"e\"], outcols=dict(rolling_avg_e=np.float64)\n", + ")\n", "results" ] }, @@ -2554,8 +2556,9 @@ " i = cuda.grid(1)\n", " if i < x.size:\n", " out[i] = x[i] * 5\n", - " \n", - "out = cudf.Series(cp.zeros(len(s), dtype='int32'))\n", + "\n", + "\n", + "out = cudf.Series(cp.zeros(len(s), dtype=\"int32\"))\n", "multiply_by_5.forall(s.shape[0])(s, out)\n", "out" ] diff --git a/docs/cudf/source/user_guide/missing-data.ipynb b/docs/cudf/source/user_guide/missing-data.ipynb index ac5bddd34cf..f1404ce0b77 100644 --- a/docs/cudf/source/user_guide/missing-data.ipynb +++ b/docs/cudf/source/user_guide/missing-data.ipynb @@ -39,8 +39,9 @@ "metadata": {}, "outputs": [], "source": [ - "import cudf\n", - "import numpy as np" + "import numpy as np\n", + "\n", + "import cudf" ] }, { @@ -50,7 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = cudf.DataFrame({'a': [1, 2, None, 4], 'b':[0.1, None, 2.3, 17.17]})" + "df = cudf.DataFrame({\"a\": [1, 2, None, 4], \"b\": [0.1, None, 2.3, 17.17]})" ] }, { @@ -221,7 +222,7 @@ } ], "source": [ - "df['a'].notna()" + "df[\"a\"].notna()" ] }, { @@ -304,7 +305,7 @@ } ], "source": [ - "df['b'] == np.nan" + "df[\"b\"] == np.nan" ] }, { @@ -535,7 +536,10 @@ ], "source": [ "import pandas as pd\n", - "datetime_series = cudf.Series([pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")])\n", + "\n", + "datetime_series = cudf.Series(\n", + " [pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")]\n", + ")\n", "datetime_series" ] }, @@ -618,7 +622,12 @@ "metadata": {}, "outputs": [], "source": [ - "df1 = cudf.DataFrame({'a':[1, None, 2, 3, None], 'b':cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False)})" + "df1 = cudf.DataFrame(\n", + " {\n", + " \"a\": [1, None, 2, 3, None],\n", + " \"b\": cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False),\n", + " }\n", + ")" ] }, { @@ -628,7 +637,9 @@ "metadata": {}, "outputs": [], "source": [ - "df2 = cudf.DataFrame({'a':[1, 11, 2, 34, 10], 'b':cudf.Series([0.23, 22, 3.2, None, 1])})" + "df2 = cudf.DataFrame(\n", + " {\"a\": [1, 11, 2, 34, 10], \"b\": cudf.Series([0.23, 22, 3.2, None, 1])}\n", + ")" ] }, { @@ -899,7 +910,7 @@ } ], "source": [ - "df1['a']" + "df1[\"a\"]" ] }, { @@ -920,7 +931,7 @@ } ], "source": [ - "df1['a'].sum()" + "df1[\"a\"].sum()" ] }, { @@ -949,7 +960,7 @@ } ], "source": [ - "df1['a'].mean()" + "df1[\"a\"].mean()" ] }, { @@ -980,7 +991,7 @@ } ], "source": [ - "df1['a'].sum(skipna=False)" + "df1[\"a\"].sum(skipna=False)" ] }, { @@ -1001,7 +1012,7 @@ } ], "source": [ - "df1['a'].mean(skipna=False)" + "df1[\"a\"].mean(skipna=False)" ] }, { @@ -1035,7 +1046,7 @@ } ], "source": [ - "df1['a'].cumsum()" + "df1[\"a\"].cumsum()" ] }, { @@ -1069,7 +1080,7 @@ } ], "source": [ - "df1['a'].cumsum(skipna=False)" + "df1[\"a\"].cumsum(skipna=False)" ] }, { @@ -1148,7 +1159,7 @@ } ], "source": [ - "cudf.Series([], dtype='float64').sum()" + "cudf.Series([], dtype=\"float64\").sum()" ] }, { @@ -1219,7 +1230,7 @@ } ], "source": [ - "cudf.Series([], dtype='float64').prod()" + "cudf.Series([], dtype=\"float64\").prod()" ] }, { @@ -1382,7 +1393,7 @@ } ], "source": [ - "df1.groupby('a').mean()" + "df1.groupby(\"a\").mean()" ] }, { @@ -1463,7 +1474,7 @@ } ], "source": [ - "df1.groupby('a', dropna=False).mean()" + "df1.groupby(\"a\", dropna=False).mean()" ] }, { @@ -1670,7 +1681,7 @@ } ], "source": [ - "df1['b'].fillna(10)" + "df1[\"b\"].fillna(10)" ] }, { @@ -1697,7 +1708,8 @@ "outputs": [], "source": [ "import cupy as cp\n", - "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list('ABC'))" + "\n", + "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list(\"ABC\"))" ] }, { @@ -2339,7 +2351,7 @@ } ], "source": [ - "df1['a'].dropna()" + "df1[\"a\"].dropna()" ] }, {