From f3af69925e5321563dea82d8d2ae463faa45ba44 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Fri, 23 Sep 2022 11:38:52 -0400 Subject: [PATCH 01/10] DOC --- CHANGELOG.md | 4 ++++ ci/checks/style.sh | 2 +- ci/gpu/build.sh | 2 +- ci/gpu/java.sh | 2 +- conda/environments/cudf_dev_cuda11.5.yml | 4 ++-- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 4 ++-- cpp/examples/basic/CMakeLists.txt | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 4 ++-- fetch_rapids.cmake | 2 +- java/src/main/native/CMakeLists.txt | 2 +- python/cudf/CMakeLists.txt | 2 +- 13 files changed, 19 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89049dff3b6..092b62d6c63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# cuDF 22.12.00 (Date TBD) + +Please see https://github.com/rapidsai/cudf/releases/tag/v22.12.00a for the latest changes to this development branch. + # cuDF 22.10.00 (Date TBD) Please see https://github.com/rapidsai/cudf/releases/tag/v22.10.00a for the latest changes to this development branch. diff --git a/ci/checks/style.sh b/ci/checks/style.sh index de3f8c01d83..29f5474fd87 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -14,7 +14,7 @@ LANG=C.UTF-8 . 
/opt/conda/etc/profile.d/conda.sh conda activate rapids -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 118bdb263af..f3c302173c8 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -35,7 +35,7 @@ unset GIT_DESCRIBE_TAG export INSTALL_DASK_MAIN=1 # ucx-py version -export UCX_PY_VERSION='0.28.*' +export UCX_PY_VERSION='0.29.*' ################################################################################ # TRAP - Setup trap for removing jitify cache diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index b110303662b..e1d3bab2bc5 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # ucx-py version -export UCX_PY_VERSION='0.28.*' +export UCX_PY_VERSION='0.29.*' ################################################################################ # TRAP - Setup trap for removing jitify cache diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 973ca731853..c3e41927a05 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -13,7 +13,7 @@ dependencies: - clang=11.1.0 - clang-tools=11.1.0 - cupy>=9.5.0,<12.0.0a0 - - rmm=22.10.* + - rmm=22.12.* - cmake>=3.20.1,!=3.23.0 - cmake_setuptools>=0.1.3 - scikit-build>=0.13.1 @@ -62,7 +62,7 @@ dependencies: - sphinx-autobuild - myst-nb - scipy - - dask-cuda=22.10.* + - dask-cuda=22.12.* - mimesis<4.1 - packaging - protobuf>=3.20.1,<3.21.0a0 diff --git a/cpp/CMakeLists.txt 
b/cpp/CMakeLists.txt index 7efa186aede..6b743662e0e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 22.10.00 + VERSION 22.12.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 871632b053d..4684e180f00 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 22.10.00 +PROJECT_NUMBER = 22.12.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2162,7 +2162,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.10 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.12 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index f4bc205d4ba..b182cb08774 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-22.10) +set(CUDF_TAG branch-22.12) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 76a012e7c6e..71341277109 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -22,7 +22,7 @@ include(rapids-find) project( CUDA_KAFKA - VERSION 22.10.00 + VERSION 22.12.00 LANGUAGES CXX ) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index db471316830..ec5b1bd2aac 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -85,9 +85,9 @@ # built documents. # # The short X.Y version. -version = '22.10' +version = '22.12' # The full version, including alpha/beta/rc tags. -release = '22.10.00' +release = '22.12.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 9e2917ffc07..cc2e201fdc3 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake ) endif() diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9410f8eacf3..26923927378 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -28,7 +28,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 22.10.00 + VERSION 22.12.00 LANGUAGES C CXX CUDA ) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 72e1779401f..6dc0f1800e0 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -set(cudf_version 22.10.00) +set(cudf_version 22.12.00) include(../../fetch_rapids.cmake) From a945377d5b0ecfc139faf87e9db62d2b9f516df0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 26 Sep 2022 17:00:15 -0500 Subject: [PATCH 02/10] Add doc section for `list` & `struct` handling (#11770) Fixes: #11011 This PR: - [x] Adds a side-section for `list` & `struct` handling. - [x] Reduces duplication. - [x] Exposes more `ListMethods` APIs. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/11770 --- docs/cudf/source/api_docs/index.rst | 2 + docs/cudf/source/api_docs/list_handling.rst | 21 +++++++ docs/cudf/source/api_docs/series.rst | 58 +++++-------------- docs/cudf/source/api_docs/struct_handling.rst | 13 +++++ 4 files changed, 50 insertions(+), 44 deletions(-) create mode 100644 docs/cudf/source/api_docs/list_handling.rst create mode 100644 docs/cudf/source/api_docs/struct_handling.rst diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst index b77c98f3ac3..ef04167c327 100644 --- a/docs/cudf/source/api_docs/index.rst +++ b/docs/cudf/source/api_docs/index.rst @@ -19,4 +19,6 @@ This page provides a list of all publicly accessible modules, methods and classe io subword_tokenize string_handling + list_handling + struct_handling options diff --git a/docs/cudf/source/api_docs/list_handling.rst b/docs/cudf/source/api_docs/list_handling.rst new file mode 100644 index 00000000000..f1fb6d1ca74 --- /dev/null +++ b/docs/cudf/source/api_docs/list_handling.rst @@ -0,0 +1,21 @@ +List handling +~~~~~~~~~~~~~ + +``Series.list`` can be used to access the values of the series as +lists and apply list methods to it. These can be accessed like +``Series.list.<function/property>``. + +.. currentmodule:: cudf.core.column.lists.ListMethods +.. autosummary:: + :toctree: api/ + + astype + concat + contains + index + get + leaves + len + sort_values + take + unique diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 1e53c90b44d..53042041f6d 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -310,21 +310,6 @@ Timedelta properties .. include:: string_handling.rst -.. 
- The following is needed to ensure the generated pages are created with the - correct template (otherwise they would be created in the Series/Index class page) - -.. - .. currentmodule:: cudf - .. autosummary:: - :toctree: api/ - :template: autosummary/accessor.rst - - Series.str - Series.cat - Series.dt - Index.str - .. _api.series.cat: Categorical accessor @@ -349,42 +334,27 @@ the ``Series.cat`` accessor. .. _api.series.list: - -List handling -~~~~~~~~~~~~~ - -``Series.list`` can be used to access the values of the series as -lists and apply list methods to it. These can be accessed like -``Series.list.<function/property>``. - -.. currentmodule:: cudf.core.column.lists.ListMethods -.. autosummary:: - :toctree: api/ - - concat - contains - get - len - sort_values - take - unique +.. include:: list_handling.rst .. _api.series.struct: +.. include:: struct_handling.rst -Struct handling -~~~~~~~~~~~~~~~ -``Series.struct`` can be used to access the values of the series as -Structs and apply struct methods to it. These can be accessed like -``Series.struct.<function/property>``. +.. + The following is needed to ensure the generated pages are created with the + correct template (otherwise they would be created in the Series/Index class page) -.. currentmodule:: cudf.core.column.struct.StructMethods -.. autosummary:: - :toctree: api/ +.. + .. currentmodule:: cudf + .. autosummary:: + :toctree: api/ + :template: autosummary/accessor.rst - field - explode + Series.str + Series.cat + Series.dt + Index.str Serialization / IO / conversion diff --git a/docs/cudf/source/api_docs/struct_handling.rst b/docs/cudf/source/api_docs/struct_handling.rst new file mode 100644 index 00000000000..05ba990382a --- /dev/null +++ b/docs/cudf/source/api_docs/struct_handling.rst @@ -0,0 +1,13 @@ +Struct handling +~~~~~~~~~~~~~~~ + +``Series.struct`` can be used to access the values of the series as +Structs and apply struct methods to it. These can be accessed like +``Series.struct.<function/property>``. + +.. 
currentmodule:: cudf.core.column.struct.StructMethods +.. autosummary:: + :toctree: api/ + + field + explode From 11156ccda67b1e7667de520cfc66ca4c3117ad12 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 26 Sep 2022 17:50:48 -0500 Subject: [PATCH 03/10] Fix issue with set-item incase of `list` and `struct` types (#11760) Fixes: #11721 This PR: - [x] Fixes: #11721, by not going through the fill & fill_inplace APIs which don't support `struct` and `list` columns. - [x] Fixes an issue in caching while constructing a `struct` or `list` scalar as `list` & `dict` objects are not hashable and we were running into the following errors: ```python In [9]: i = cudf.Scalar([10, 11]) --------------------------------------------------------------------------- KeyError Traceback (most recent call last) File /nvme/0/pgali/envs/cudfdev/lib/python3.9/site-packages/cudf/core/scalar.py:51, in CachedScalarInstanceMeta.__call__(self, value, dtype) 49 try: 50 # try retrieving an instance from the cache: ---> 51 self.__instances.move_to_end(cache_key) 52 return self.__instances[cache_key] KeyError: ([10, 11], , None, ) During handling of the above exception, another exception occurred: TypeError Traceback (most recent call last) Cell In [9], line 1 ----> 1 i = cudf.Scalar([10, 11]) File /nvme/0/pgali/envs/cudfdev/lib/python3.9/site-packages/cudf/core/scalar.py:57, in CachedScalarInstanceMeta.__call__(self, value, dtype) 53 except KeyError: 54 # if an instance couldn't be found in the cache, 55 # construct it and add to cache: 56 obj = super().__call__(value, dtype=dtype) ---> 57 self.__instances[cache_key] = obj 58 if len(self.__instances) > self.__maxsize: 59 self.__instances.popitem(last=False) TypeError: unhashable type: 'list' ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/11760 --- python/cudf/cudf/core/column/column.py | 7 +++- 
python/cudf/cudf/core/scalar.py | 6 ++- python/cudf/cudf/tests/test_setitem.py | 55 ++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8b2c51dae90..66ae984ee81 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -518,7 +518,12 @@ def _scatter_by_slice( self._check_scatter_key_length(num_keys, value) - if step == 1: + if step == 1 and not isinstance( + self, (cudf.core.column.StructColumn, cudf.core.column.ListColumn) + ): + # NOTE: List & Struct dtypes aren't supported by both + # inplace & out-of-place fill. Hence we need to use scatter for + # these two types. if isinstance(value, cudf.core.scalar.Scalar): return self._fill(value, start, stop, inplace=True) else: diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 1deb56963a7..36018425fc6 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -54,7 +54,11 @@ def __call__(self, value, dtype=None): # if an instance couldn't be found in the cache, # construct it and add to cache: obj = super().__call__(value, dtype=dtype) - self.__instances[cache_key] = obj + try: + self.__instances[cache_key] = obj + except TypeError: + # couldn't hash the arguments, don't cache: + return obj if len(self.__instances) > self.__maxsize: self.__instances.popitem(last=False) return obj diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index cb455ae831c..13b342e6c3b 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -242,3 +242,58 @@ def test_categorical_setitem_invalid(): "the categories first", ): gs[0] = 5 + + +def test_series_slice_setitem_list(): + actual = cudf.Series([[[1, 2], [2, 3]], [[3, 4]], [[4, 5]], [[6, 7]]]) + actual[slice(0, 3, 1)] = [[10, 11], [12, 23]] + expected = cudf.Series( + [ + [[10, 11], 
[12, 23]], + [[10, 11], [12, 23]], + [[10, 11], [12, 23]], + [[6, 7]], + ] + ) + assert_eq(actual, expected) + + actual = cudf.Series([[[1, 2], [2, 3]], [[3, 4]], [[4, 5]], [[6, 7]]]) + actual[0:3] = cudf.Scalar([[10, 11], [12, 23]]) + + assert_eq(actual, expected) + + +def test_series_slice_setitem_struct(): + actual = cudf.Series( + [ + {"a": {"b": 10}, "b": 11}, + {"a": {"b": 100}, "b": 5}, + {"a": {"b": 50}, "b": 2}, + {"a": {"b": 1000}, "b": 67}, + {"a": {"b": 4000}, "b": 1090}, + ] + ) + actual[slice(0, 3, 1)] = {"a": {"b": 5050}, "b": 101} + expected = cudf.Series( + [ + {"a": {"b": 5050}, "b": 101}, + {"a": {"b": 5050}, "b": 101}, + {"a": {"b": 5050}, "b": 101}, + {"a": {"b": 1000}, "b": 67}, + {"a": {"b": 4000}, "b": 1090}, + ] + ) + assert_eq(actual, expected) + + actual = cudf.Series( + [ + {"a": {"b": 10}, "b": 11}, + {"a": {"b": 100}, "b": 5}, + {"a": {"b": 50}, "b": 2}, + {"a": {"b": 1000}, "b": 67}, + {"a": {"b": 4000}, "b": 1090}, + ] + ) + actual[0:3] = cudf.Scalar({"a": {"b": 5050}, "b": 101}) + + assert_eq(actual, expected) From e64c2da1207d5069bc627b5b08bbcc1f25636e76 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 26 Sep 2022 18:03:33 -0500 Subject: [PATCH 04/10] Fix return type of `Index.isna` & `Index.notna` (#11769) This PR fixes: #11159 by returning correct object type for the result of `isna` & `notna` in `Index`. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/11769 --- python/cudf/cudf/core/frame.py | 28 ++++++++++++++-------------- python/cudf/cudf/core/index.py | 20 ++++++++++++++++++++ python/cudf/cudf/tests/test_index.py | 4 ++-- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 40926a1c8cb..4fb914a6409 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1196,7 +1196,7 @@ def _copy_type_metadata( return self @_cudf_nvtx_annotate - def isnull(self): + def isna(self): """ Identify missing values. @@ -1236,7 +1236,7 @@ def isnull(self): 0 5 Alfred 1 6 1939-05-27 00:00:00.000000 Batman Batmobile 2 1940-04-25 00:00:00.000000 Joker - >>> df.isnull() + >>> df.isna() age born name toy 0 False True False True 1 False False False False @@ -1252,7 +1252,7 @@ def isnull(self): 3 Inf 4 -Inf dtype: float64 - >>> ser.isnull() + >>> ser.isna() 0 False 1 False 2 True @@ -1265,17 +1265,17 @@ def isnull(self): >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) >>> idx Float64Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') - >>> idx.isnull() - GenericIndex([False, False, True, True, False, False], dtype='bool') + >>> idx.isna() + array([False, False, True, True, False, False]) """ data_columns = (col.isnull() for col in self._columns) return self._from_data_like_self(zip(self._column_names, data_columns)) - # Alias for isnull - isna = isnull + # Alias for isna + isnull = isna @_cudf_nvtx_annotate - def notnull(self): + def notna(self): """ Identify non-missing values. 
@@ -1315,7 +1315,7 @@ def notnull(self): 0 5 Alfred 1 6 1939-05-27 00:00:00.000000 Batman Batmobile 2 1940-04-25 00:00:00.000000 Joker - >>> df.notnull() + >>> df.notna() age born name toy 0 True False True False 1 True True True True @@ -1331,7 +1331,7 @@ def notnull(self): 3 Inf 4 -Inf dtype: float64 - >>> ser.notnull() + >>> ser.notna() 0 True 1 True 2 False @@ -1344,14 +1344,14 @@ def notnull(self): >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) >>> idx Float64Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') - >>> idx.notnull() - GenericIndex([True, True, False, False, True, True], dtype='bool') + >>> idx.notna() + array([ True, True, False, False, True, True]) """ data_columns = (col.notnull() for col in self._columns) return self._from_data_like_self(zip(self._column_names, data_columns)) - # Alias for notnull - notna = notnull + # Alias for notna + notnull = notna @_cudf_nvtx_annotate def searchsorted( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 57a10358561..b6ae7beebc5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -841,6 +841,14 @@ def nunique(self): def isna(self): return cupy.zeros(len(self), dtype=bool) + isnull = isna + + @_cudf_nvtx_annotate + def notna(self): + return cupy.ones(len(self), dtype=bool) + + notnull = notna + @_cudf_nvtx_annotate def _minmax(self, meth: str): no_steps = len(self) - 1 @@ -1313,6 +1321,18 @@ def find_label_range(self, first, last): end += 1 return begin, end + @_cudf_nvtx_annotate + def isna(self): + return self._column.isnull().values + + isnull = isna + + @_cudf_nvtx_annotate + def notna(self): + return self._column.notnull().values + + notnull = notna + @_cudf_nvtx_annotate def get_slice_bound(self, label, side, kind=None): return self._values.get_slice_bound(label, side, kind) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 236fd619b8e..e8c568979a3 100644 --- 
a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -406,14 +406,14 @@ def test_index_copy_deep(idx, deep): def test_index_isna(idx): pidx = pd.Index(idx, name="idx") gidx = cudf.core.index.Int64Index(idx, name="idx") - assert_eq(gidx.isna().to_numpy(), pidx.isna()) + assert_eq(gidx.isna(), pidx.isna()) @pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) def test_index_notna(idx): pidx = pd.Index(idx, name="idx") gidx = cudf.core.index.Int64Index(idx, name="idx") - assert_eq(gidx.notna().to_numpy(), pidx.notna()) + assert_eq(gidx.notna(), pidx.notna()) def test_rangeindex_slice_attr_name(): From d24bce5d36788edf2f4a98e62128f7f7d4cc3645 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 26 Sep 2022 20:34:23 -0500 Subject: [PATCH 05/10] Remove `kwargs` in `read_csv` & `to_csv` (#11762) Fixes: #11683, #10823 This PR: - [x] Removes `kwargs` in CSV reader & writer such that users get clear errors when they misspell a parameter. - [x] Re-orders `read_csv` & `to_csv` parameters which will now match to pandas. The diff is actually adding `storage_options` to `read_csv` & `to_csv` after removing `kwargs`, and the rest of it all re-ordering appropriately. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/11762 --- python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/io/csv.py | 66 +++++++------ python/cudf/cudf/tests/test_s3.py | 5 +- python/cudf/cudf/utils/ioutils.py | 154 ++++++++++++++++------------- 4 files changed, 133 insertions(+), 100 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 77aeec286a5..cac37f1f274 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5987,11 +5987,11 @@ def to_csv( columns=None, header=True, index=True, - line_terminator="\n", - chunksize=None, encoding=None, compression=None, - **kwargs, + line_terminator="\n", + chunksize=None, + storage_options=None, ): """{docstring}""" from cudf.io import csv @@ -6008,7 +6008,7 @@ def to_csv( chunksize=chunksize, encoding=encoding, compression=compression, - **kwargs, + storage_options=storage_options, ) @ioutils.doc_to_orc() diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index a665d7e8d7d..0adf432c31d 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -18,45 +18,52 @@ @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, - lineterminator="\n", - quotechar='"', - quoting=0, - doublequote=True, - header="infer", - mangle_dupe_cols=True, - usecols=None, sep=",", delimiter=None, - delim_whitespace=False, - skipinitialspace=False, + header="infer", names=None, + index_col=None, + usecols=None, + prefix=None, + mangle_dupe_cols=True, dtype=None, - skipfooter=0, - skiprows=0, - dayfirst=False, - compression="infer", - thousands=None, - decimal=".", true_values=None, false_values=None, + skipinitialspace=False, + skiprows=0, + skipfooter=0, nrows=None, - byte_range=None, - skip_blank_lines=True, - parse_dates=None, - comment=None, 
na_values=None, keep_default_na=True, na_filter=True, - prefix=None, - index_col=None, + skip_blank_lines=True, + parse_dates=None, + dayfirst=False, + compression="infer", + thousands=None, + decimal=".", + lineterminator="\n", + quotechar='"', + quoting=0, + doublequote=True, + comment=None, + delim_whitespace=False, + byte_range=None, use_python_file_object=True, - **kwargs, + storage_options=None, + bytes_per_thread=None, ): """{docstring}""" + if use_python_file_object and bytes_per_thread is not None: + raise ValueError( + "bytes_per_thread is only supported when " + "`use_python_file_object=False`" + ) + is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer( path_or_data=filepath_or_buffer, - **kwargs, + storage_options=storage_options, ) if not is_single_filepath_or_buffer: raise NotImplementedError( @@ -68,7 +75,10 @@ def read_csv( compression=compression, iotypes=(BytesIO, StringIO, NativeFile), use_python_file_object=use_python_file_object, - **kwargs, + storage_options=storage_options, + bytes_per_thread=256_000_000 + if bytes_per_thread is None + else bytes_per_thread, ) if na_values is not None and is_scalar(na_values): @@ -142,11 +152,11 @@ def to_csv( columns=None, header=True, index=True, - line_terminator="\n", - chunksize=None, encoding=None, compression=None, - **kwargs, + line_terminator="\n", + chunksize=None, + storage_options=None, ): """{docstring}""" @@ -172,7 +182,7 @@ def to_csv( return_as_string = True path_or_buf = ioutils.get_writer_filepath_or_buffer( - path_or_data=path_or_buf, mode="w", **kwargs + path_or_data=path_or_buf, mode="w", storage_options=storage_options ) if columns is not None: diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 3219a6ad847..5c06dea4ca6 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -151,7 +151,6 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): got = cudf.read_csv( f"s3://{bucket}/{fname}", 
storage_options=s3so, - bytes_per_thread=bytes_per_thread, use_python_file_object=True, ) assert_eq(pdf, got) @@ -188,7 +187,9 @@ def test_read_csv_byte_range( f"s3://{bucket}/{fname}", storage_options=s3so, byte_range=(74, 73), - bytes_per_thread=bytes_per_thread, + bytes_per_thread=bytes_per_thread + if not use_python_file_object + else None, header=None, names=["Integer", "Float", "Integer2", "String", "Boolean"], use_python_file_object=use_python_file_object, diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index fb1b0235822..84d39459a12 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -842,79 +842,43 @@ Delimiter to be used. delimiter : char, default None Alternative argument name for sep. -delim_whitespace : bool, default False - Determines whether to use whitespace as delimiter. -lineterminator : char, default '\\n' - Character to indicate end of line. -skipinitialspace : bool, default False - Skip spaces after delimiter. -names : list of str, default None - List of column names to be used. -dtype : type, str, list of types, or dict of column -> type, default None - Data type(s) for data or columns. If `dtype` is a type/str, all columns - are mapped to the particular type passed. If list, types are applied in - the same order as the column names. If dict, types are mapped to the - column names. - E.g. {{‘a’: np.float64, ‘b’: int32, ‘c’: ‘float’}} - If `None`, dtypes are inferred from the dataset. Use `str` to preserve data - and not infer or interpret to dtype. -quotechar : char, default '"' - Character to indicate start and end of quote item. -quoting : str or int, default 0 - Controls quoting behavior. Set to one of - 0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL), - 2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE). - Quoting is enabled with all values except 3. 
-doublequote : bool, default True - When quoting is enabled, indicates whether to interpret two - consecutive quotechar inside fields as single quotechar header : int, default 'infer' Row number to use as the column names. Default behavior is to infer the column names: if no names are passed, header=0; if column names are passed explicitly, header=None. +names : list of str, default None + List of column names to be used. +index_col : int, string or False, default None + Column to use as the row labels of the DataFrame. Passing `index_col=False` + explicitly disables index column inference and discards the last column. usecols : list of int or str, default None Returns subset of the columns given in the list. All elements must be either integer indices (column number) or strings that correspond to column names +prefix : str, default None + Prefix to add to column numbers when parsing without a header row mangle_dupe_cols : boolean, default True Duplicate columns will be specified as 'X','X.1',...'X.N'. -skiprows : int, default 0 - Number of rows to be skipped from the start of file. -skipfooter : int, default 0 - Number of rows to be skipped at the bottom of file. -compression : {{'infer', 'gzip', 'zip', None}}, default 'infer' - For on-the-fly decompression of on-disk data. If ‘infer’, then detect - compression from the following extensions: ‘.gz’,‘.zip’ (otherwise no - decompression). If using ‘zip’, the ZIP file must contain only one - data file to be read in, otherwise the first non-zero-sized file will - be used. Set to None for no decompression. -decimal : char, default '.' - Character used as a decimal point. -thousands : char, default None - Character used as a thousands delimiter. +dtype : type, str, list of types, or dict of column -> type, default None + Data type(s) for data or columns. If `dtype` is a type/str, all columns + are mapped to the particular type passed. If list, types are applied in + the same order as the column names. 
If dict, types are mapped to the + column names. + E.g. {{‘a’: np.float64, ‘b’: int32, ‘c’: ‘float’}} + If `None`, dtypes are inferred from the dataset. Use `str` to preserve data + and not infer or interpret to dtype. true_values : list, default None Values to consider as boolean True false_values : list, default None Values to consider as boolean False +skipinitialspace : bool, default False + Skip spaces after delimiter. +skiprows : int, default 0 + Number of rows to be skipped from the start of file. +skipfooter : int, default 0 + Number of rows to be skipped at the bottom of file. nrows : int, default None If specified, maximum number of rows to read -byte_range : list or tuple, default None - Byte range within the input file to be read. The first number is the - offset in bytes, the second number is the range size in bytes. Set the - size to zero to read all data after the offset location. Reads the row - that starts before or at the end of the range, even if it ends after - the end of the range. -skip_blank_lines : bool, default True - If True, discard and do not parse empty lines - If False, interpret empty lines as NaN values -parse_dates : list of int or names, default None - If list of columns, then attempt to parse each entry as a date. - Columns may not always be recognized as dates, for instance due to - unusual or non-standard formats. To guarantee a date and increase parsing - speed, explicitly specify `dtype='date'` for the desired columns. -comment : char, default None - Character used as a comments indicator. If found at the beginning of a - line, the line will be ignored altogether. na_values : scalar, str, or list-like, optional Additional strings to recognize as nulls. By default the following values are interpreted as @@ -927,16 +891,67 @@ na_filter : bool, default True Detect missing values (empty strings and the values in na_values). Passing False can improve performance. 
-prefix : str, default None - Prefix to add to column numbers when parsing without a header row -index_col : int, string or False, default None - Column to use as the row labels of the DataFrame. Passing `index_col=False` - explicitly disables index column inference and discards the last column. +skip_blank_lines : bool, default True + If True, discard and do not parse empty lines + If False, interpret empty lines as NaN values +parse_dates : list of int or names, default None + If list of columns, then attempt to parse each entry as a date. + Columns may not always be recognized as dates, for instance due to + unusual or non-standard formats. To guarantee a date and increase parsing + speed, explicitly specify `dtype='date'` for the desired columns. +dayfirst : bool, default False + DD/MM format dates, international and European format. +compression : {{'infer', 'gzip', 'zip', None}}, default 'infer' + For on-the-fly decompression of on-disk data. If ‘infer’, then detect + compression from the following extensions: ‘.gz’,‘.zip’ (otherwise no + decompression). If using ‘zip’, the ZIP file must contain only one + data file to be read in, otherwise the first non-zero-sized file will + be used. Set to None for no decompression. +thousands : char, default None + Character used as a thousands delimiter. +decimal : char, default '.' + Character used as a decimal point. +lineterminator : char, default '\\n' + Character to indicate end of line. +quotechar : char, default '"' + Character to indicate start and end of quote item. +quoting : str or int, default 0 + Controls quoting behavior. Set to one of + 0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL), + 2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE). + Quoting is enabled with all values except 3. +doublequote : bool, default True + When quoting is enabled, indicates whether to interpret two + consecutive quotechar inside fields as single quotechar +comment : char, default None + Character used as a comments indicator. 
If found at the beginning of a + line, the line will be ignored altogether. +delim_whitespace : bool, default False + Determines whether to use whitespace as delimiter. +byte_range : list or tuple, default None + Byte range within the input file to be read. The first number is the + offset in bytes, the second number is the range size in bytes. Set the + size to zero to read all data after the offset location. Reads the row + that starts before or at the end of the range, even if it ends after + the end of the range. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects at IO time. This option is likely to improve performance when making small reads from larger CSV files. - +storage_options : dict, optional, default None + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value + pairs are forwarded to ``urllib.request.Request`` as header options. + For other URLs (e.g. starting with “s3://”, and “gcs://”) the key-value + pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and + ``urllib`` for more details. +bytes_per_thread : int, default None + Determines the number of bytes to be allocated per thread to read the + files in parallel. When there is a file of large size, we get slightly + better throughput by decomposing it and transferring multiple "blocks" + in parallel (using a python thread pool). Default allocation is + 256_000_000 bytes. + This parameter is functional only when `use_python_file_object=False`. Returns ------- GPU ``DataFrame`` object. 
@@ -1010,15 +1025,22 @@ Write out the column names index : bool, default True Write out the index as a column -line_terminator : char, default '\\n' -chunksize : int or None, default None - Rows to write at a time encoding : str, default 'utf-8' A string representing the encoding to use in the output file Only ‘utf-8’ is currently supported compression : str, None A string representing the compression scheme to use in the the output file Compression while writing csv is not supported currently +line_terminator : char, default '\\n' +chunksize : int or None, default None + Rows to write at a time +storage_options : dict, optional, default None + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value + pairs are forwarded to ``urllib.request.Request`` as header options. + For other URLs (e.g. starting with “s3://”, and “gcs://”) the key-value + pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and + ``urllib`` for more details. Returns ------- None or str From 0a430fa42cf74ea876a530b0a51efc90e1285f37 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 26 Sep 2022 19:55:42 -0700 Subject: [PATCH 06/10] Fix `cudf::partition*` APIs that do not return offsets for empty output table (#11709) By definition, the `cudf::partition*` APIs return a vector of offsets whose size is at least the number of partitions. As such, an empty output table should be associated with an output offsets array like `[0, 0, ..., 0]` (all zeros). However, the output offsets in such situations are currently an empty array. This PR corrects the implementation for such corner cases. Closes https://github.com/rapidsai/cudf/issues/11700.
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/11709 --- cpp/src/partitioning/partitioning.cu | 5 +++-- cpp/src/partitioning/round_robin.cu | 4 ++++ cpp/tests/partitioning/hash_partition_test.cpp | 10 +++++----- cpp/tests/partitioning/partition_test.cpp | 2 +- cpp/tests/partitioning/round_robin_test.cpp | 14 +++++++++++++- python/cudf/cudf/_lib/cpp/table/table.pxd | 3 ++- python/cudf/cudf/_lib/hash.pyx | 4 ---- python/cudf/cudf/core/indexed_frame.py | 3 +++ 8 files changed, 31 insertions(+), 14 deletions(-) diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index bbf601784fb..3e0cc26dcdd 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -728,7 +728,7 @@ std::pair, std::vector> hash_partition( // Return empty result if there are no partitions or nothing to hash if (num_partitions <= 0 || input.num_rows() == 0 || table_to_hash.num_columns() == 0) { - return std::pair(empty_like(input), std::vector{}); + return std::pair(empty_like(input), std::vector(num_partitions, 0)); } if (has_nulls(table_to_hash)) { @@ -753,7 +753,8 @@ std::pair, std::vector> partition( CUDF_EXPECTS(not partition_map.has_nulls(), "Unexpected null values in partition_map."); if (num_partitions == 0 or t.num_rows() == 0) { - return std::pair(empty_like(t), std::vector{}); + // The output offsets vector must have size `num_partitions + 1` as per documentation. 
+ return std::pair(empty_like(t), std::vector(num_partitions + 1, 0)); } return cudf::type_dispatcher( diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index b6a098a0cab..d455df3e890 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -166,6 +166,10 @@ std::pair, std::vector> round_robin_part "Incorrect start_partition index. Must be positive."); // since cudf::size_type is an alias for // int32_t, it _can_ be negative + if (nrows == 0) { + return std::pair(empty_like(input), std::vector(num_partitions, 0)); + } + // handle degenerate case: // if (num_partitions >= nrows) { diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp index 3ec6ae97595..1addbca945b 100644 --- a/cpp/tests/partitioning/hash_partition_test.cpp +++ b/cpp/tests/partitioning/hash_partition_test.cpp @@ -72,7 +72,7 @@ TEST_F(HashPartition, ZeroPartitions) // Expect empty table with same number of columns and zero partitions EXPECT_EQ(input.num_columns(), output->num_columns()); EXPECT_EQ(0, output->num_rows()); - EXPECT_EQ(std::size_t{0}, offsets.size()); + EXPECT_EQ(std::size_t{num_partitions}, offsets.size()); } TEST_F(HashPartition, ZeroRows) @@ -87,10 +87,10 @@ TEST_F(HashPartition, ZeroRows) cudf::size_type const num_partitions = 3; auto [output, offsets] = cudf::hash_partition(input, columns_to_hash, num_partitions); - // Expect empty table with same number of columns and zero partitions + // Expect empty table with same number of columns and same number of partitions EXPECT_EQ(input.num_columns(), output->num_columns()); EXPECT_EQ(0, output->num_rows()); - EXPECT_EQ(std::size_t{0}, offsets.size()); + EXPECT_EQ(std::size_t{num_partitions}, offsets.size()); } TEST_F(HashPartition, ZeroColumns) @@ -102,10 +102,10 @@ TEST_F(HashPartition, ZeroColumns) cudf::size_type const num_partitions = 3; auto [output, offsets] = cudf::hash_partition(input, columns_to_hash, 
num_partitions); - // Expect empty table with same number of columns and zero partitions + // Expect empty table with same number of columns and same number of partitions EXPECT_EQ(input.num_columns(), output->num_columns()); EXPECT_EQ(0, output->num_rows()); - EXPECT_EQ(std::size_t{0}, offsets.size()); + EXPECT_EQ(std::size_t{num_partitions}, offsets.size()); } TEST_F(HashPartition, MixedColumnTypes) diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index 014a19e93a9..98cd1b821c6 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -53,7 +53,7 @@ TYPED_TEST(PartitionTest, EmptyInputs) auto result_offsets = result.second; - EXPECT_TRUE(result_offsets.empty()); + EXPECT_EQ(result_offsets.size(), std::size_t{11}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column, result.first->get_column(0)); } diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp index a72c22f5714..eebc65a07f0 100644 --- a/cpp/tests/partitioning/round_robin_test.cpp +++ b/cpp/tests/partitioning/round_robin_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,6 +47,18 @@ class RoundRobinTest : public cudf::test::BaseFixture { TYPED_TEST_SUITE(RoundRobinTest, cudf::test::FixedWidthTypes); +TYPED_TEST(RoundRobinTest, EmptyInput) +{ + auto const empty_column = fixed_width_column_wrapper{}; + auto const num_partitions = 5; + auto const start_partition = 0; + auto const [out_table, out_offsets] = + cudf::round_robin_partition(cudf::table_view{{empty_column}}, num_partitions, start_partition); + + EXPECT_EQ(out_table->num_rows(), 0); + EXPECT_EQ(out_offsets.size(), std::size_t{num_partitions}); +} + TYPED_TEST(RoundRobinTest, RoundRobinPartitions13_3) { strings_column_wrapper rrColWrap1( diff --git a/python/cudf/cudf/_lib/cpp/table/table.pxd b/python/cudf/cudf/_lib/cpp/table/table.pxd index 13e1ceb6430..d7f3de76c63 100644 --- a/python/cudf/cudf/_lib/cpp/table/table.pxd +++ b/python/cudf/cudf/_lib/cpp/table/table.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -14,6 +14,7 @@ cdef extern from "cudf/table/table.hpp" namespace "cudf" nogil: table(vector[unique_ptr[column]]&& columns) except + table(table_view) except + size_type num_columns() except + + size_type num_rows() except + table_view view() except + mutable_table_view mutable_view() except + vector[unique_ptr[column]] release() except + diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 8bb8ab92a48..1eba3a2f6b5 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -33,13 +33,9 @@ def hash_partition(list source_columns, object columns_to_hash, ) ) - # Note that the offsets (`c_result.second`) may be empty when - # the original table (`source_columns`) is empty. We need to - # return a list of zeros in this case. 
return ( columns_from_unique_ptr(move(c_result.first)), list(c_result.second) - if c_result.second.size() else [0] * num_partitions ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 30b1bc704c8..0acacc798a1 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1621,6 +1621,9 @@ def _empty_like(self, keep_index=True): ) def _split(self, splits, keep_index=True): + if self._num_rows == 0: + return [] + columns_split = libcudf.copying.columns_split( [ *(self._index._data.columns if keep_index else []), From c5d555abb635f40d8f607acf02783b7e6d67e324 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 27 Sep 2022 15:19:29 +0530 Subject: [PATCH 07/10] JSON Column creation in GPU (#11714) This PR generates json column creation from the traversed json tree. It has following parts 1. `reduce_to_column_tree` - Reduce node tree into column tree by aggregating each property of each column and number of rows in each column. 2. `make_json_column2` - creates the GPU json column tree structure from tree and column info 3. `json_column_to_cudf_column2` - converts this GPU json column to cudf column. 4. `parse_nested_json2` - combines all json tokenizer, json tree generation, traversal, json column creation, cudf column conversion together. All steps run on device. Depends on PR #11518 #11610 For code-review, use PR https://github.com/karthikeyann/cudf/pull/5 which contains only this tree changes. ### Overview - PR #11264 Tokenizes the JSON string to Tokens - PR #11518 Converts Tokens to Nodes (tree representation) - PR #11610 Traverses this node tree --> assigns column id and row index to each node. - This PR #11714 Converts this traversed tree into JSON Column, which in turn is translated to `cudf::column` JSON has 5 categories of nodes. STRUCT, LIST, FIELD, VALUE, STRING, STRUCT, LIST are nested types. 
FIELD nodes are struct columns' keys. VALUE node is similar to STRING column but without double quotes. Actual datatype conversion happens in `json_column_to_cudf_column2` Tree Representation `tree_meta_t` has 4 data members. 1. node categories 2. node parents' id 3. node level 4. node's string range {begin, end} (as 2 vectors) Currently supported JSON formats are records orient, and JSON lines. ### This PR - Detailed explanation This PR has 3 steps. 1. `reduce_to_column_tree` - Required to compute total number of columns, column type, nested column structure, and number of rows in each column. - Generates `tree_meta_t` data members for column. - - Sort node tree by col_id (stable sort) - - reduce_by_key custom_op on node_categories, collapses to column category - - unique_by_key_copy by col_id, copies first parent_node_id, string_ranges. This parent_node_id will be transformed to parent_column_id. - - reduce_by_key max on row_offsets gives maximum row offset in each column, Propagate list column children's max row offset to their children because sometime structs may miss entries, so parent list gives correct count. 5. `make_json_column2` - Converts nodes to GPU json columns in tree structure - - get column tree, transfer column names to host. - - Create `d_json_column` for non-field columns. - - if 2 columns occurs on same path, and one of them is nested and other is string column, discard the string column. - - For STRUCT, LIST, VALUE, STRING nodes, set the validity bits, and copy string {begin, end} range to string_offsets and string length. - - Compute list offset - - Perform scan max operation on offsets. (to fill 0's with previous offset value). - Now the `d_json_column` is nested, and contains offsets, validity bits, unparsed unconverted string information. 6. `json_column_to_cudf_column2` - converts this GPU json column to cudf column. 
- Recursively goes over each `d_json_column` and converts to `cudf::column` by inferring the type, parsing the string to type, and setting validity bits further. 7. `parse_nested_json2` - combines all json tokenizer, json tree generation, traversal, json column creation, cudf column conversion together. All steps run on device. Authors: - Karthikeyan (https://github.com/karthikeyann) - Elias Stehle (https://github.com/elstehle) - Yunsong Wang (https://github.com/PointKernel) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Tobias Ribizel (https://github.com/upsj) - https://github.com/nvdbaranec - GALI PREM SAGAR (https://github.com/galipremsagar) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/11714 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/io/json/nested_json.cpp | 3 +- cpp/include/cudf/io/detail/data_casting.cuh | 2 + cpp/src/io/fst/dispatch_dfa.cuh | 6 +- cpp/src/io/json/experimental/read_json.cpp | 9 +- cpp/src/io/json/json_column.cu | 840 ++++++++++++++++++++ cpp/src/io/json/json_tree.cu | 41 +- cpp/src/io/json/nested_json.hpp | 79 +- cpp/src/io/json/nested_json_gpu.cu | 14 +- cpp/src/io/utilities/type_inference.cuh | 2 + cpp/tests/io/json_tree.cpp | 20 +- cpp/tests/io/nested_json_test.cpp | 83 +- python/cudf/cudf/tests/test_json.py | 11 +- 13 files changed, 1051 insertions(+), 60 deletions(-) create mode 100644 cpp/src/io/json/json_column.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7efa186aede..8847ad36000 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -327,6 +327,7 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/json_column.cu src/io/json/json_gpu.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp index e2d4c3b77d8..bb3e13a3a01 100644 --- 
a/cpp/benchmarks/io/json/nested_json.cpp +++ b/cpp/benchmarks/io/json/nested_json.cpp @@ -76,7 +76,8 @@ void BM_NESTED_JSON(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { // Allocate device-side temporary storage & run algorithm - cudf::io::json::detail::parse_nested_json(input, default_options, cudf::default_stream_value); + cudf::io::json::detail::device_parse_nested_json( + input, default_options, cudf::default_stream_value); }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 3862b9e033d..628c00ad603 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -304,6 +305,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); if (col_type == cudf::data_type{cudf::type_id::STRING}) { rmm::device_uvector offsets(col_size + 1, stream); diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index cabbe863131..6b6077f475b 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -211,8 +211,8 @@ struct DispatchFSM : DeviceFSMPolicy { if (CubDebug(error = dfa_simulation_config.Init(dfa_kernel))) return error; // Kernel invocation - uint32_t grid_size = - CUB_QUOTIENT_CEILING(num_chars, PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD); + uint32_t grid_size = std::max( + 1u, CUB_QUOTIENT_CEILING(num_chars, PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD)); uint32_t block_threads = dfa_simulation_config.block_threads; dfa_kernel<<>>(dfa, @@ -348,7 +348,7 @@ struct DispatchFSM : DeviceFSMPolicy { NUM_SYMBOLS_PER_BLOCK = 
BLOCK_THREADS * SYMBOLS_PER_THREAD }; - BlockOffsetT num_blocks = CUB_QUOTIENT_CEILING(num_chars, NUM_SYMBOLS_PER_BLOCK); + BlockOffsetT num_blocks = std::max(1u, CUB_QUOTIENT_CEILING(num_chars, NUM_SYMBOLS_PER_BLOCK)); size_t num_threads = num_blocks * BLOCK_THREADS; //------------------------------------------------------------------------------ diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 7d78bd34b19..c0eaa43e68f 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -53,7 +53,14 @@ table_with_metadata read_json(host_span> sources, auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); - return cudf::io::json::detail::parse_nested_json(data, reader_opts, stream, mr); + try { + return cudf::io::json::detail::device_parse_nested_json(data, reader_opts, stream, mr); + } catch (cudf::logic_error const& err) { +#ifdef NJP_DEBUG_PRINT + std::cout << "Fall back to host nested json parser" << std::endl; +#endif + return cudf::io::json::detail::host_parse_nested_json(data, reader_opts, stream, mr); + } } } // namespace cudf::io::detail::json::experimental diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu new file mode 100644 index 00000000000..d54bb5c8ea9 --- /dev/null +++ b/cpp/src/io/json/json_column.cu @@ -0,0 +1,840 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nested_json.hpp" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::io::json { +namespace detail { + +// DEBUG prints +auto to_cat = [](auto v) -> std::string { + switch (v) { + case NC_STRUCT: return " S"; + case NC_LIST: return " L"; + case NC_STR: return " \""; + case NC_VAL: return " V"; + case NC_FN: return " F"; + case NC_ERR: return "ER"; + default: return "UN"; + }; +}; +auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; +auto print_vec = [](auto const& cpu, auto const name, auto converter) { + for (auto const& v : cpu) + printf("%3s,", converter(v).c_str()); + std::cout << name << std::endl; +}; + +void print_tree(host_span input, + tree_meta_t const& d_gpu_tree, + rmm::cuda_stream_view stream = cudf::default_stream_value) +{ + print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream), + "node_categories", + to_cat); + print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.parent_node_ids, stream), + "parent_node_ids", + to_int); + print_vec( + cudf::detail::make_std_vector_async(d_gpu_tree.node_levels, stream), "node_levels", to_int); + auto node_range_begin = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_begin, stream); + auto node_range_end = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_end, stream); + print_vec(node_range_begin, "node_range_begin", to_int); + print_vec(node_range_end, "node_range_end", to_int); + for (int i = 0; i < int(node_range_begin.size()); i++) { + printf("%3s ", + std::string(input.data() + node_range_begin[i], node_range_end[i] - node_range_begin[i]) + .c_str()); + } + 
printf(" (JSON)\n"); +} + +/** + * @brief Reduces node tree representation to column tree representation. + * + * @param tree Node tree representation of JSON string + * @param col_ids Column ids of nodes + * @param row_offsets Row offsets of nodes + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns + */ +std::tuple, rmm::device_uvector> +reduce_to_column_tree(tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // 1. sort_by_key {col_id}, {row_offset} stable + rmm::device_uvector node_ids(row_offsets.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + col_ids.begin(), + col_ids.end(), + thrust::make_zip_iterator(node_ids.begin(), row_offsets.begin())); + auto num_columns = thrust::unique_count(rmm::exec_policy(stream), col_ids.begin(), col_ids.end()); + + // 2. reduce_by_key {col_id}, {row_offset}, max. + rmm::device_uvector unique_col_ids(num_columns, stream); + rmm::device_uvector max_row_offsets(num_columns, stream); + thrust::reduce_by_key(rmm::exec_policy(stream), + col_ids.begin(), + col_ids.end(), + row_offsets.begin(), + unique_col_ids.begin(), + max_row_offsets.begin(), + thrust::equal_to(), + thrust::maximum()); + + // 3. 
reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) + rmm::device_uvector column_categories(num_columns, stream); + thrust::reduce_by_key( + rmm::exec_policy(stream), + col_ids.begin(), + col_ids.end(), + thrust::make_permutation_iterator(tree.node_categories.begin(), node_ids.begin()), + unique_col_ids.begin(), + column_categories.begin(), + thrust::equal_to(), + [] __device__(NodeT type_a, NodeT type_b) -> NodeT { + auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); + auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); + // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) + // *+*=*, v+v=v + if (type_a == type_b) { + return type_a; + } else if (is_a_leaf) { + // *+v=*, N+V=N + // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR + return type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); + } else if (is_b_leaf) { + return type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a); + } + // *+#=E + return NC_ERR; + }); + + // 4. 
unique_copy parent_node_ids, ranges + rmm::device_uvector column_levels(0, stream); // not required + rmm::device_uvector parent_col_ids(num_columns, stream); + rmm::device_uvector col_range_begin(num_columns, stream); // Field names + rmm::device_uvector col_range_end(num_columns, stream); + rmm::device_uvector unique_node_ids(num_columns, stream); + thrust::unique_by_key_copy(rmm::exec_policy(stream), + col_ids.begin(), + col_ids.end(), + node_ids.begin(), + thrust::make_discard_iterator(), + unique_node_ids.begin()); + thrust::copy_n( + rmm::exec_policy(stream), + thrust::make_zip_iterator( + thrust::make_permutation_iterator(tree.parent_node_ids.begin(), unique_node_ids.begin()), + thrust::make_permutation_iterator(tree.node_range_begin.begin(), unique_node_ids.begin()), + thrust::make_permutation_iterator(tree.node_range_end.begin(), unique_node_ids.begin())), + unique_node_ids.size(), + thrust::make_zip_iterator( + parent_col_ids.begin(), col_range_begin.begin(), col_range_end.begin())); + + // Restore the order + { + // use scatter to restore the order + rmm::device_uvector temp_col_ids(col_ids.size(), stream); + rmm::device_uvector temp_row_offsets(row_offsets.size(), stream); + thrust::scatter(rmm::exec_policy(stream), + thrust::make_zip_iterator(col_ids.begin(), row_offsets.begin()), + thrust::make_zip_iterator(col_ids.end(), row_offsets.end()), + node_ids.begin(), + thrust::make_zip_iterator(temp_col_ids.begin(), temp_row_offsets.begin())); + thrust::copy(rmm::exec_policy(stream), + thrust::make_zip_iterator(temp_col_ids.begin(), temp_row_offsets.begin()), + thrust::make_zip_iterator(temp_col_ids.end(), temp_row_offsets.end()), + thrust::make_zip_iterator(col_ids.begin(), row_offsets.begin())); + } + + // convert parent_node_ids to parent_col_ids + thrust::transform(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.end(), + parent_col_ids.begin(), + [col_ids = col_ids.begin()] __device__(auto parent_node_id) -> size_type { + return 
parent_node_id == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_id]; + }); + + // copy lists' max_row_offsets to children. + // all structs should have same size. + thrust::transform_if( + rmm::exec_policy(stream), + unique_col_ids.begin(), + unique_col_ids.end(), + max_row_offsets.begin(), + [column_categories = column_categories.begin(), + parent_col_ids = parent_col_ids.begin(), + max_row_offsets = max_row_offsets.begin()] __device__(size_type col_id) { + auto parent_col_id = parent_col_ids[col_id]; + while (parent_col_id != parent_node_sentinel and + column_categories[parent_col_id] != node_t::NC_LIST) { + col_id = parent_col_id; + parent_col_id = parent_col_ids[parent_col_id]; + } + return max_row_offsets[col_id]; + }, + [column_categories = column_categories.begin(), + parent_col_ids = parent_col_ids.begin()] __device__(size_type col_id) { + auto parent_col_id = parent_col_ids[col_id]; + return parent_col_id != parent_node_sentinel and + (column_categories[parent_col_id] != node_t::NC_LIST); + // Parent is not a list, or sentinel/root + }); + + return std::tuple{tree_meta_t{std::move(column_categories), + std::move(parent_col_ids), + std::move(column_levels), + std::move(col_range_begin), + std::move(col_range_end)}, + std::move(unique_col_ids), + std::move(max_row_offsets)}; +} + +/** + * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
+ * + * @param input String device buffer + * @param node_range_begin Begin offset of the strings + * @param node_range_end End offset of the strings + * @param stream CUDA stream + * @return Vector of strings + */ +std::vector copy_strings_to_host(device_span input, + device_span node_range_begin, + device_span node_range_end, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto const num_strings = node_range_begin.size(); + rmm::device_uvector> string_views(num_strings, stream); + auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + thrust::transform(rmm::exec_policy(stream), + d_offset_pairs, + d_offset_pairs + num_strings, + string_views.begin(), + [data = input.data()] __device__(auto const& offsets) { + // Note: first character for non-field columns + return thrust::make_pair( + data + thrust::get<0>(offsets), + static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); + }); + auto d_column_names = cudf::make_strings_column(string_views, stream); + auto to_host = [](auto const& col) { + if (col.is_empty()) return std::vector{}; + auto const scv = cudf::strings_column_view(col); + auto const h_chars = cudf::detail::make_std_vector_sync( + cudf::device_span(scv.chars().data(), scv.chars().size()), + cudf::default_stream_value); + auto const h_offsets = cudf::detail::make_std_vector_sync( + cudf::device_span( + scv.offsets().data() + scv.offset(), scv.size() + 1), + cudf::default_stream_value); + + // build std::string vector from chars and offsets + std::vector host_data; + host_data.reserve(col.size()); + std::transform( + std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + return host_data; + }; + return to_host(d_column_names->view()); +} + +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using 
row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. + * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + // 1. gather column information. 
+ auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, col_ids, row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + auto unique_col_ids = cudf::detail::make_std_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_std_vector_async(d_column_tree.node_categories, stream); + auto column_parent_ids = + cudf::detail::make_std_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream); + auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream); + std::vector column_names = copy_strings_to_host( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + auto init_to_zero = [stream](auto& v) { + thrust::uninitialized_fill(rmm::exec_policy(stream), v.begin(), v.end(), 0); + }; + + auto initialize_json_columns = [&](auto i, auto& col) { + if (column_categories[i] == NC_ERR || column_categories[i] == NC_FN) { + return; + } else if (column_categories[i] == NC_VAL || column_categories[i] == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + } else if (column_categories[i] == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + init_to_zero(col.child_offsets); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity.resize(bitmask_allocation_size_bytes(max_row_offsets[i] + 1), stream); + init_to_zero(col.validity); + col.type = to_json_col_type(column_categories[i]); + }; + + // 2. 
generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. + auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + + // use hash map because we may skip field name's col_ids + std::unordered_map> columns; + // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking + std::map, NodeIndexT> mapped_columns; + // find column_ids which are values, but should be ignored in validity + std::vector ignore_vals(num_columns, 0); + columns.try_emplace(parent_node_sentinel, std::ref(root)); + + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + std::string name = ""; + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + name = "element"; + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } else { + CUDF_FAIL("Unexpected parent column category"); + } + // If the child is already found, + // replace if this column is a nested column and the existing was a value column + // ignore this column if this column is a value column and the existing was a nested column + auto it = columns.find(parent_col_id); + CUDF_EXPECTS(it != columns.end(), "Parent column not found"); + auto& parent_col = it->second.get(); + bool replaced = false; + if (mapped_columns.count({parent_col_id, name}) > 0) { + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { + 
ignore_vals[this_col_id] = 1; + continue; + } + auto old_col_id = mapped_columns[{parent_col_id, name}]; + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + // remap + ignore_vals[old_col_id] = 1; + mapped_columns.erase({parent_col_id, name}); + columns.erase(old_col_id); + parent_col.child_columns.erase(name); + replaced = true; // to skip duplicate name in column_order + } else { + // If this is a nested column but we're trying to insert either (a) a list node into a + // struct column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and + column_categories[this_col_id] == NC_STRUCT) or + (column_categories[old_col_id] == NC_STRUCT and + column_categories[this_col_id] == NC_LIST)), + "A mix of lists and structs within the same column is not supported"); + } + } + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name"); + // move into parent + device_json_column col(stream, mr); + initialize_json_columns(this_col_id, col); + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + if (not replaced) parent_col.column_order.push_back(name); + columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); + mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); + } + // restore unique_col_ids order + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<1>(a) < thrust::get<1>(b); + }); + // move columns data to device. 
+ std::vector columns_data(num_columns); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + col.validity.data()}; + } + + // 3. scatter string offsets to respective columns, set validity bits + auto d_ignore_vals = cudf::detail::make_device_uvector_async(ignore_vals, stream); + auto d_columns_data = cudf::detail::make_device_uvector_async(columns_data, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + col_ids.size(), + [node_categories = tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + switch (node_categories[i]) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_VAL: + case NC_STR: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. 
scatter List offset + // sort_by_key {col_id}, {node_id} + // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + rmm::device_uvector original_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), original_col_ids.begin()); + rmm::device_uvector node_ids(row_offsets.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), node_ids.begin()); + + auto ordered_parent_node_ids = + thrust::make_permutation_iterator(tree.parent_node_ids.begin(), node_ids.begin()); + auto ordered_row_offsets = + thrust::make_permutation_iterator(row_offsets.begin(), node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + col_ids.size(), + [num_nodes = col_ids.size(), + ordered_parent_node_ids, + ordered_row_offsets, + original_col_ids = original_col_ids.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + node_categories = tree.node_categories.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + auto parent_node_id = ordered_parent_node_ids[i]; + if (parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST) { + // unique item + if (i == 0 || + (col_ids[i - 1] != col_ids[i] or ordered_parent_node_ids[i - 1] != parent_node_id)) { + // scatter to list_offset + d_columns_data[original_col_ids[parent_node_id]] + .child_offsets[row_offsets[parent_node_id]] = ordered_row_offsets[i]; + } + // TODO: verify if this code is right. check with more test cases. + if (i == num_nodes - 1 || (col_ids[i] != col_ids[i + 1])) { + // last value of list child_offset is its size. 
+ d_columns_data[original_col_ids[parent_node_id]] + .child_offsets[row_offsets[parent_node_id] + 1] = ordered_row_offsets[i] + 1; + } + } + }); + + // restore col_ids, TODO is this required? + // thrust::copy( + // rmm::exec_policy(stream), original_col_ids.begin(), original_col_ids.end(), col_ids.begin()); + + // 5. scan on offsets. + for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } +} + +/** + * @brief Retrieves the parse_options to be used for type inference and type casting + * + * @param options The reader options to influence the relevant type inference and type casting + * options + */ +cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options); + +std::pair, std::vector> device_json_column_to_cudf_column( + device_json_column& json_col, + device_span d_input, + cudf::io::json_reader_options const& options, + std::optional schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + auto make_validity = + [stream](device_json_column& json_col) -> std::pair { + CUDF_EXPECTS(json_col.validity.size() >= bitmask_allocation_size_bytes(json_col.num_rows), + "valid_count is too small"); + auto null_count = + cudf::detail::null_count(json_col.validity.data(), 0, json_col.num_rows, stream); + // full null_mask is always required for parse_data + return {json_col.validity.release(), null_count}; + // Note: json_col modified here, moves this memory + }; + + auto get_child_schema = [schema](auto child_name) -> std::optional { + if (schema.has_value()) { + auto const 
result = schema.value().child_types.find(child_name); + if (result != std::end(schema.value().child_types)) { return result->second; } + } + return {}; + }; + + switch (json_col.type) { + case json_col_t::StringColumn: { + // move string_offsets to GPU and transform to string column + auto const col_size = json_col.string_offsets.size(); + using char_length_pair_t = thrust::pair; + CUDF_EXPECTS(json_col.string_offsets.size() == json_col.string_lengths.size(), + "string offset, string length mismatch"); + rmm::device_uvector d_string_data(col_size, stream); + // TODO how about directly storing pair in json_column? + auto offset_length_it = + thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); + // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference + auto string_ranges_it = + thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { + return thrust::pair{ + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; + }); + + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto string_spans_it = thrust::make_transform_iterator( + offset_length_it, [data = d_input.data()] __device__(auto ip) { + return thrust::pair{ + data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; + }); + + data_type target_type{}; + + if (schema.has_value()) { +#ifdef NJP_DEBUG_PRINT + std::cout << "-> explicit type: " + << (schema.has_value() ? 
std::to_string(static_cast(schema->type.id())) + : "n/a"); +#endif + target_type = schema.value().type; + } + // Infer column type, if we don't have an explicit type for it + else { + target_type = cudf::io::detail::infer_data_type( + parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); + } + // Convert strings to the inferred data type + auto col = experimental::detail::parse_data(string_spans_it, + col_size, + target_type, + make_validity(json_col).first, + parsing_options(options).view(), + stream, + mr); + + // Reset nullable if we do not have nulls + // This is to match the existing JSON reader's behaviour: + // - Non-string columns will always be returned as nullable + // - String columns will be returned as nullable, iff there's at least one null entry + if (target_type.id() == type_id::STRING and col->null_count() == 0) { + col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); + } + + // For string columns return ["offsets", "char"] schema + if (target_type.id() == type_id::STRING) { + return {std::move(col), {{"offsets"}, {"chars"}}}; + } + // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema + return {std::move(col), {}}; + } + case json_col_t::StructColumn: { + std::vector> child_columns; + std::vector column_names{}; + size_type num_rows{json_col.num_rows}; + // Create children columns + for (auto const& col_name : json_col.column_order) { + auto const& col = json_col.child_columns.find(col_name); + column_names.emplace_back(col->first); + auto& child_col = col->second; + auto [child_column, names] = device_json_column_to_cudf_column( + child_col, d_input, options, get_child_schema(col_name), stream, mr); + CUDF_EXPECTS(num_rows == child_column->size(), + "All children columns must have the same size"); + child_columns.push_back(std::move(child_column)); + column_names.back().children = names; + } + auto [result_bitmask, null_count] = make_validity(json_col); + return { + 
make_structs_column( + num_rows, std::move(child_columns), null_count, std::move(result_bitmask), stream, mr), + column_names}; + } + case json_col_t::ListColumn: { + size_type num_rows = json_col.child_offsets.size() - 1; + std::vector column_names{}; + column_names.emplace_back("offsets"); + column_names.emplace_back(json_col.child_columns.begin()->first); + + // Note: json_col modified here, reuse the memory + auto offsets_column = std::make_unique( + data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release()); + // Create children column + auto [child_column, names] = + device_json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); + column_names.back().children = names; + auto [result_bitmask, null_count] = make_validity(json_col); + return {make_lists_column(num_rows, + std::move(offsets_column), + std::move(child_column), + null_count, + std::move(result_bitmask), + stream, + mr), + std::move(column_names)}; + } + default: CUDF_FAIL("Unsupported column type"); break; + } +} + +table_with_metadata device_parse_nested_json(host_span input, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + // Allocate device memory for the JSON input & copy over to device + rmm::device_uvector d_input = cudf::detail::make_device_uvector_async(input, stream); + + auto gpu_tree = [&]() { + // Parse the JSON and get the token stream + const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream); + // gpu tree generation + return get_tree_representation(tokens_gpu, token_indices_gpu, stream); + }(); // IILE used to free memory of token data. 
+#ifdef NJP_DEBUG_PRINT + print_tree(input, gpu_tree, stream); +#endif + + auto [gpu_col_id, gpu_row_offsets] = records_orient_tree_traversal(d_input, gpu_tree, stream); + + device_json_column root_column(stream, mr); + root_column.type = json_col_t::ListColumn; + root_column.child_offsets.resize(2, stream); + thrust::fill(rmm::exec_policy(stream), + root_column.child_offsets.begin(), + root_column.child_offsets.end(), + 0); + + // Get internal JSON column + make_device_json_column(d_input, gpu_tree, gpu_col_id, gpu_row_offsets, root_column, stream, mr); + + // data_root refers to the root column of the data represented by the given JSON string + auto& data_root = + options.is_enabled_lines() ? root_column : root_column.child_columns.begin()->second; + + // Zero row entries + if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) { + return table_with_metadata{std::make_unique(std::vector>{}), + {{}, std::vector{}}}; + } + + // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects) + auto constexpr single_child_col_count = 1; + CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and + data_root.child_columns.size() == single_child_col_count and + data_root.child_columns.begin()->second.type == json_col_t::StructColumn, + "Currently the nested JSON parser only supports an array of (nested) objects"); + + // Slice off the root list column, which has only a single row that contains all the structs + auto& root_struct_col = data_root.child_columns.begin()->second; + + // Initialize meta data to be populated while recursing through the tree of columns + std::vector> out_columns; + std::vector out_column_names; + + // Iterate over the struct's child columns and convert to cudf column + size_type column_index = 0; + for (auto const& col_name : root_struct_col.column_order) { + auto& json_col = root_struct_col.child_columns.find(col_name)->second; + // Insert this columns name into the schema + 
out_column_names.emplace_back(col_name); + + std::optional child_schema_element = std::visit( + cudf::detail::visitor_overload{ + [column_index](const std::vector& user_dtypes) -> std::optional { + return (static_cast(column_index) < user_dtypes.size()) + ? std::optional{{user_dtypes[column_index]}} + : std::optional{}; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? std::optional{{user_dtypes.find(col_name)->second}} + : std::optional{}; + }, + [col_name](std::map const& user_dtypes) + -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? user_dtypes.find(col_name)->second + : std::optional{}; + }}, + options.get_dtypes()); +#ifdef NJP_DEBUG_PRINT + auto debug_schema_print = [](auto ret) { + std::cout << ", type id: " + << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") + << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" + << "\n"; + }; + std::visit( + cudf::detail::visitor_overload{[column_index](const std::vector&) { + std::cout << "Column by index: #" << column_index; + }, + [col_name](std::map const&) { + std::cout << "Column by flat name: '" << col_name; + }, + [col_name](std::map const&) { + std::cout << "Column by nested name: #" << col_name; + }}, + options.get_dtypes()); + debug_schema_print(child_schema_element); +#endif + + // Get this JSON column's cudf column and schema info, (modifies json_col) + auto [cudf_col, col_name_info] = device_json_column_to_cudf_column( + json_col, d_input, options, child_schema_element, stream, mr); + + out_column_names.back().children = std::move(col_name_info); + out_columns.emplace_back(std::move(cudf_col)); + + column_index++; + } + + return table_with_metadata{std::make_unique
(std::move(out_columns)), + {{}, out_column_names}}; +} + +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 3a26a1479e5..dbf026c351e 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -61,7 +61,10 @@ struct token_to_node { case token_t::StructBegin: return NC_STRUCT; case token_t::ListBegin: return NC_LIST; case token_t::StringBegin: return NC_STR; - case token_t::ValueBegin: return NC_VAL; + case token_t::ValueBegin: + return NC_STR; // NC_VAL; + // NV_VAL is removed because type inference and + // reduce_to_column_tree category collapsing takes care of this. case token_t::FieldNameBegin: return NC_FN; default: return NC_ERR; }; @@ -143,7 +146,7 @@ tree_meta_t get_tree_representation(device_span tokens, }; // Whether the token pops from the parent node stack - auto does_pop = [] __device__(PdaTokenT const token) { + auto does_pop = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::StructMemberEnd: case token_t::StructEnd: @@ -153,7 +156,7 @@ tree_meta_t get_tree_representation(device_span tokens, }; // Whether the token pushes onto the parent node stack - auto does_push = [] __device__(PdaTokenT const token) { + auto does_push = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::FieldNameBegin: case token_t::StructBegin: @@ -182,7 +185,7 @@ tree_meta_t get_tree_representation(device_span tokens, "node category count mismatch"); // Node levels: transform_exclusive_scan, copy_if. - rmm::device_uvector token_levels(num_tokens, stream); + rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { return does_push(token) - does_pop(token); @@ -225,6 +228,7 @@ tree_meta_t get_tree_representation(device_span tokens, // TODO: make it own function. 
rmm::device_uvector parent_token_ids(num_tokens, stream); rmm::device_uvector initial_order(num_tokens, stream); + // TODO re-write the algorithm to work only on nodes, not tokens. thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end()); thrust::tabulate(rmm::exec_policy(stream), @@ -250,15 +254,18 @@ tree_meta_t get_tree_representation(device_span tokens, parent_token_ids.data(), thrust::equal_to{}, thrust::maximum{}); - // Reusing token_levels memory & use scatter to restore the original order. - std::swap(token_levels, parent_token_ids); - auto& sorted_parent_token_ids = token_levels; - thrust::scatter(rmm::exec_policy(stream), - sorted_parent_token_ids.begin(), - sorted_parent_token_ids.end(), - initial_order.data(), - parent_token_ids.data()); + // scatter to restore the original order. + { + rmm::device_uvector temp_storage(num_tokens, stream); + thrust::scatter(rmm::exec_policy(stream), + parent_token_ids.begin(), + parent_token_ids.end(), + initial_order.begin(), + temp_storage.begin()); + thrust::copy( + rmm::exec_policy(stream), temp_storage.begin(), temp_storage.end(), parent_token_ids.begin()); + } rmm::device_uvector node_ids_gpu(num_tokens, stream); thrust::exclusive_scan( @@ -349,6 +356,7 @@ rmm::device_uvector hash_node_type_with_field_name(device_spansecond.load(cuda::std::memory_order_relaxed); }; + // convert field nodes to node indices, and other nodes to enum value. 
rmm::device_uvector node_type(num_nodes, stream); thrust::tabulate(rmm::exec_policy(stream), @@ -378,6 +386,7 @@ rmm::device_uvector translate_sorted_parent_node_indices( device_span parent_node_ids, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); auto const num_nodes = scatter_indices.size(); auto const gather_indices = cudf::detail::scatter_to_gather( scatter_indices.begin(), scatter_indices.end(), num_nodes, stream); @@ -425,10 +434,11 @@ std::pair, rmm::device_uvector> gene CUDF_FUNC_RANGE(); auto const num_nodes = node_type.size(); - rmm::device_uvector scatter_indices(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); rmm::device_uvector col_id(num_nodes, stream, mr); rmm::device_uvector parent_col_id(num_nodes, stream); + if (num_nodes == 0) { return {std::move(col_id), std::move(parent_col_id)}; } + rmm::device_uvector scatter_indices(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); // scatter 1 to level_boundaries alone, useful for scan later thrust::scatter(rmm::exec_policy(stream), thrust::make_constant_iterator(1), @@ -457,6 +467,7 @@ std::pair, rmm::device_uvector> gene // To invoke Radix sort for keys {parent_col_id, node_type} instead of merge sort, // we need to split to 2 Radix sorts. // Secondary sort on node_type + thrust::stable_sort_by_key( rmm::exec_policy(stream), node_type.data() + level_boundaries[level - 1], @@ -493,6 +504,7 @@ std::pair, rmm::device_uvector> gene col_id.data() + level_boundaries[level] + (level != num_levels - 1), // +1 only for not-last-levels, for next level start col_id col_id.data() + level_boundaries[level - 1]); + // scatter to restore original order. auto const num_nodes_per_level = level_boundaries[level] - level_boundaries[level - 1]; { @@ -662,6 +674,7 @@ records_orient_tree_traversal(device_span d_input, // 3. Find level boundaries. 
auto level_boundaries = [&]() { + if (d_tree.node_levels.is_empty()) return rmm::device_uvector{0, stream}; // Already node_levels is sorted auto max_level = d_tree.node_levels.back_element(stream); rmm::device_uvector level_boundaries(max_level + 1, stream); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 548f5c4e9e9..10d209b2ea6 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -172,6 +172,52 @@ struct json_column { uint32_t child_count); }; +/** + * @brief Intermediate representation of data from a nested JSON input, in device memory. + * Device memory equivalent of `json_column`. + */ +struct device_json_column { + // Type used to count number of rows + using row_offset_t = size_type; + + // The inferred type of this column (list, struct, or value/string column) + json_col_t type = json_col_t::Unknown; + + rmm::device_uvector string_offsets; + rmm::device_uvector string_lengths; + + // Row offsets + rmm::device_uvector child_offsets; + + // Validity bitmap + rmm::device_uvector validity; + + // Map of child columns, if applicable. + // Following "element" as the default child column's name of a list column + // Using the struct's field names + std::map child_columns; + std::vector column_order; + // Counting the current number of items in this column + row_offset_t num_rows = 0; + + /** + * @brief Construct a new d json column object + * + * @note `mr` is used for allocating the device memory for child_offsets, and validity + * since it will moved into cudf::column later. 
+ * + * @param stream The CUDA stream to which kernels are dispatched + * @param mr Optional, resource with which to allocate + */ + device_json_column(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + : string_offsets(0, stream), + string_lengths(0, stream), + child_offsets(0, stream, mr), + validity(0, stream, mr) + { + } +}; + /** * @brief Tokens emitted while parsing a JSON input */ @@ -256,7 +302,7 @@ std::pair, rmm::device_uvector> ge tree_meta_t get_tree_representation( device_span tokens, device_span token_indices, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -276,6 +322,33 @@ records_orient_tree_traversal( tree_meta_t& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Reduce node tree into column tree by aggregating each property of column. + * + * @param tree json node tree to reduce (modified in-place, but restored to original state) + * @param col_ids column ids of each node (modified in-place, but restored to original state) + * @param row_offsets row offsets of each node (modified in-place, but restored to original state) + * @param stream The CUDA stream to which kernels are dispatched + * @return A tuple containing the column tree, identifier for each column and the maximum row index + * in each column + */ +std::tuple, rmm::device_uvector> +reduce_to_column_tree(tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + rmm::cuda_stream_view stream); + +/** @copydoc host_parse_nested_json + * All processing is done in device memory. 
+ * + */ +table_with_metadata device_parse_nested_json( + host_span input, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Parses the given JSON string and generates table from the given input. * @@ -285,10 +358,10 @@ records_orient_tree_traversal( * @param mr Optional, resource with which to allocate * @return The data parsed from the given JSON input */ -table_with_metadata parse_nested_json( +table_with_metadata host_parse_nested_json( host_span input, cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 552cd1e6167..5d60a564b9b 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1712,10 +1712,10 @@ std::pair, std::vector> json_column_to return {}; } -table_with_metadata parse_nested_json(host_span input, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata host_parse_nested_json(host_span input, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Range of orchestrating/encapsulating function CUDF_FUNC_RANGE(); @@ -1758,6 +1758,12 @@ table_with_metadata parse_nested_json(host_span input, auto const& data_root = new_line_delimited_json ? root_column : root_column.child_columns.begin()->second; + // Zero row entries + if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) { + return table_with_metadata{std::make_unique
(std::vector>{}), + {{}, std::vector{}}}; + } + // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects) auto constexpr single_child_col_count = 1; CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 578c72fc316..a9bc15ec40f 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -283,6 +284,7 @@ cudf::data_type infer_data_type(OptionsView const& options, std::size_t const size, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream); diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 940d9d8ca0a..3d024fe8af8 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -289,7 +289,7 @@ tree_meta_t2 get_tree_representation_cpu(device_span tokens_gpu case token_t::StructBegin: return NC_STRUCT; case token_t::ListBegin: return NC_LIST; case token_t::StringBegin: return NC_STR; - case token_t::ValueBegin: return NC_VAL; + case token_t::ValueBegin: return NC_STR; // NC_VAL; case token_t::FieldNameBegin: return NC_FN; default: return NC_ERR; }; @@ -576,14 +576,14 @@ TEST_F(JsonTest, TreeRepresentation) // Golden sample of node categories std::vector golden_node_categories = { cuio_json::NC_LIST, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STR, - cuio_json::NC_FN, cuio_json::NC_LIST, cuio_json::NC_VAL, cuio_json::NC_VAL, - cuio_json::NC_VAL, cuio_json::NC_FN, cuio_json::NC_STR, cuio_json::NC_FN, - cuio_json::NC_STR, cuio_json::NC_FN, cuio_json::NC_VAL, cuio_json::NC_STRUCT, + cuio_json::NC_FN, cuio_json::NC_LIST, cuio_json::NC_STR, cuio_json::NC_STR, + cuio_json::NC_STR, cuio_json::NC_FN, 
cuio_json::NC_STR, cuio_json::NC_FN, + cuio_json::NC_STR, cuio_json::NC_FN, cuio_json::NC_STR, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STR, cuio_json::NC_FN, cuio_json::NC_LIST, - cuio_json::NC_VAL, cuio_json::NC_STRUCT, cuio_json::NC_VAL, cuio_json::NC_STRUCT, + cuio_json::NC_STR, cuio_json::NC_STRUCT, cuio_json::NC_STR, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_LIST, cuio_json::NC_STRUCT, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STR, cuio_json::NC_FN, cuio_json::NC_STR, - cuio_json::NC_FN, cuio_json::NC_VAL}; + cuio_json::NC_FN, cuio_json::NC_STR}; // Golden sample of node ids // clang-format off @@ -664,9 +664,9 @@ TEST_F(JsonTest, TreeRepresentation2) // clang-format off std::vector golden_node_categories = { cuio_json::NC_LIST, cuio_json::NC_STRUCT, - cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_VAL, cuio_json::NC_FN, cuio_json::NC_LIST, - cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_VAL, cuio_json::NC_FN, cuio_json::NC_VAL, - cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_VAL, cuio_json::NC_FN, cuio_json::NC_VAL}; + cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STR, cuio_json::NC_FN, cuio_json::NC_LIST, + cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STR, cuio_json::NC_FN, cuio_json::NC_STR, + cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STR, cuio_json::NC_FN, cuio_json::NC_STR}; // Golden sample of node ids std::vector golden_parent_node_ids = { @@ -744,6 +744,7 @@ struct JsonTreeTraversalTest : public cudf::test::BaseFixture, // std::vector json_list = { + "[]", "value", "\"string\"", "[1, 2, 3]", @@ -767,6 +768,7 @@ std::vector json_list = { std::vector json_lines_list = { // Test input a: {x:i, y:i, z:[]}, b: {x:i, z:i} with JSON-lines + "", R"( {} { "a": { "y" : 6, "z": [] 
}} { "a": { "y" : 6, "z": [2, 3, 4, 5] }} diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index bcfde4eedeb..65926be495f 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -450,40 +450,50 @@ TEST_F(JsonTest, TokenStream2) } } -TEST_F(JsonTest, ExtractColumn) +struct JsonParserTest : public cudf::test::BaseFixture, public testing::WithParamInterface { +}; +INSTANTIATE_TEST_SUITE_P(Experimental, JsonParserTest, testing::Bool()); + +TEST_P(JsonParserTest, ExtractColumn) { using cuio_json::SymbolT; + bool const is_full_gpu = GetParam(); + auto json_parser = is_full_gpu ? cuio_json::detail::device_parse_nested_json + : cuio_json::detail::host_parse_nested_json; // Prepare cuda stream for data transfers & kernels auto const stream = cudf::default_stream_value; + auto mr = rmm::mr::get_current_device_resource(); // Default parsing options cudf::io::json_reader_options default_options{}; std::string const input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; // Get the JSON's tree representation - auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, default_options, stream); + auto const cudf_table = json_parser( + cudf::host_span{input.data(), input.size()}, default_options, stream, mr); - auto const expected_col_count = 2; - auto const first_column_index = 0; - auto const second_column_index = 1; + auto const expected_col_count = 2; EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); auto expected_col1 = cudf::test::fixed_width_column_wrapper({0.0, 0.1, 0.2}, {true, true, true}); auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 1.2}, {true, true, true}); - cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); + cudf::column_view parsed_col1 = cudf_table.tbl->get_column(0); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); - cudf::column_view parsed_col2 
= cudf_table.tbl->get_column(second_column_index); + cudf::column_view parsed_col2 = cudf_table.tbl->get_column(1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); } -TEST_F(JsonTest, UTF_JSON) +TEST_P(JsonParserTest, UTF_JSON) { // Prepare cuda stream for data transfers & kernels - auto const stream = cudf::default_stream_value; + auto const stream = cudf::default_stream_value; + auto mr = rmm::mr::get_current_device_resource(); + bool const is_full_gpu = GetParam(); + auto json_parser = is_full_gpu ? cuio_json::detail::device_parse_nested_json + : cuio_json::detail::host_parse_nested_json; // Default parsing options cudf::io::json_reader_options default_options{}; @@ -497,7 +507,7 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(ascii_pass, default_options, stream)); + CUDF_EXPECT_NO_THROW(json_parser(ascii_pass, default_options, stream, mr)); // utf-8 string that fails parsing. std::string const utf_failed = R"([ @@ -507,7 +517,7 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":8.0,"c":null, "d": {}}, {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "filip ʒakotɛ"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_failed, default_options, stream)); + CUDF_EXPECT_NO_THROW(json_parser(utf_failed, default_options, stream, mr)); // utf-8 string that passes parsing. 
std::string const utf_pass = R"([ @@ -518,15 +528,19 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip ʒakotɛ"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, default_options, stream)); + CUDF_EXPECT_NO_THROW(json_parser(utf_pass, default_options, stream, mr)); } -TEST_F(JsonTest, ExtractColumnWithQuotes) +TEST_P(JsonParserTest, ExtractColumnWithQuotes) { using cuio_json::SymbolT; + bool const is_full_gpu = GetParam(); + auto json_parser = is_full_gpu ? cuio_json::detail::device_parse_nested_json + : cuio_json::detail::host_parse_nested_json; // Prepare cuda stream for data transfers & kernels auto const stream = cudf::default_stream_value; + auto mr = rmm::mr::get_current_device_resource(); // Default parsing options cudf::io::json_reader_options options{}; @@ -534,8 +548,8 @@ TEST_F(JsonTest, ExtractColumnWithQuotes) std::string const input = R"( [{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}] )"; // Get the JSON's tree representation - auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, options, stream); + auto const cudf_table = + json_parser(cudf::host_span{input.data(), input.size()}, options, stream, mr); auto constexpr expected_col_count = 2; EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); @@ -550,12 +564,16 @@ TEST_F(JsonTest, ExtractColumnWithQuotes) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); } -TEST_F(JsonTest, ExpectFailMixStructAndList) +TEST_P(JsonParserTest, ExpectFailMixStructAndList) { using cuio_json::SymbolT; + bool const is_full_gpu = GetParam(); + auto json_parser = is_full_gpu ? 
cuio_json::detail::device_parse_nested_json + : cuio_json::detail::host_parse_nested_json; // Prepare cuda stream for data transfers & kernels auto const stream = cudf::default_stream_value; + auto mr = rmm::mr::get_current_device_resource(); // Default parsing options cudf::io::json_reader_options options{}; @@ -572,14 +590,37 @@ TEST_F(JsonTest, ExpectFailMixStructAndList) for (auto const& input : inputs_fail) { CUDF_EXPECT_THROW_MESSAGE( - auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, options, stream), + auto const cudf_table = json_parser( + cudf::host_span{input.data(), input.size()}, options, stream, mr), "A mix of lists and structs within the same column is not supported"); } for (auto const& input : inputs_succeed) { CUDF_EXPECT_NO_THROW( - auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, options, stream)); + auto const cudf_table = json_parser( + cudf::host_span{input.data(), input.size()}, options, stream, mr)); } } + +TEST_P(JsonParserTest, EmptyString) +{ + using cuio_json::SymbolT; + bool const is_full_gpu = GetParam(); + auto json_parser = is_full_gpu ? 
cuio_json::detail::device_parse_nested_json + : cuio_json::detail::host_parse_nested_json; + + // Prepare cuda stream for data transfers & kernels + auto const stream = cudf::default_stream_value; + auto mr = rmm::mr::get_current_device_resource(); + + // Default parsing options + cudf::io::json_reader_options default_options{}; + + std::string const input = R"([])"; + // Get the JSON's tree representation + auto const cudf_table = json_parser( + cudf::host_span{input.data(), input.size()}, default_options, stream, mr); + + auto const expected_col_count = 0; + EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); +} diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index f6ca4691669..92227707b18 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -603,18 +603,21 @@ def test_json_nested_basic(tmpdir): "c1": [{"f2": "sf21"}, {"f1": "sf12"}], "c2": [["l11", "l21"], []], }, + # empty input + {}, ], ) -def test_json_nested_lines(data): +@pytest.mark.parametrize("lines", [True, False]) +def test_json_nested_lines(data, lines): bytes = BytesIO() pdf = pd.DataFrame(data) - pdf.to_json(bytes, orient="records", lines=True) + pdf.to_json(bytes, orient="records", lines=lines) bytes.seek(0) df = cudf.read_json( - bytes, engine="cudf_experimental", orient="records", lines=True + bytes, engine="cudf_experimental", orient="records", lines=lines ) bytes.seek(0) - pdf = pd.read_json(bytes, orient="records", lines=True) + pdf = pd.read_json(bytes, orient="records", lines=lines) # In the second test-case we need to take a detour via pyarrow # Pandas omits "f1" in first row, so we have to enforce a common schema, # such that pandas would have the f1 member with null From 831ef04e4e01f7bd1aa8f580dfb038c96997f3ef Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 27 Sep 2022 11:50:29 +0200 Subject: [PATCH 08/10] Add BGZIP `data_chunk_reader` (#11652) This adds a BGZIP `data_chunk_reader` usable 
with `multibyte_split`. The BGZIP format is a modified GZIP format that consists of multiple blocks of at most 65536 bytes compressed data describing at most 65536 bytes of uncompressed data. The data can be accessed with record offsets provided by Tabix index files, which contain so-called virtual offsets (unsigned 64 bit) of the following form ``` 63 16 0 +----------------------+-------+ | block offset | local | +----------------------+-------+ ``` The lower 16 bits describe the offset inside the uncompressed data belonging to a single compressed block, the upper 48 bits describe the offset of the compressed block inside the BGZIP file. The interface allows two modes: Reading a full compressed file, and reading between the locations described by two Tabix virtual offsets. For a description of the BGZIP format, check section 4 in the [SAM specification](https://github.com/samtools/hts-specs/blob/master/SAMv1.pdf). Closes #10466 ## TODO - [x] Use events to avoid clobbering data that is still in use - [x] stricter handling of local_begin (currently it may overflow into subsequent blocks) - [x] add tests where local_begin and local_end are in the same chunk or even block - [x] ~~add cudf deflate fallback if nvComp doesn't support it~~ this should not be necessary, since we only test with compatible nvcomp versions Authors: - Tobias Ribizel (https://github.com/upsj) Approvers: - Michael Wang (https://github.com/isVoid) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/11652 --- cpp/CMakeLists.txt | 1 + .../io/text/data_chunk_source_factories.hpp | 37 +- cpp/src/io/comp/nvcomp_adapter.cpp | 25 +- cpp/src/io/text/bgzip_data_chunk_source.cu | 444 ++++++++++++++++++ .../io/text/data_chunk_source_factories.cpp | 30 +- cpp/src/io/text/device_data_chunks.hpp | 47 ++ cpp/tests/io/text/data_chunk_source_test.cpp | 249 +++++++++- 7 files changed, 776 insertions(+), 57 deletions(-) create 
mode 100644 cpp/src/io/text/bgzip_data_chunk_source.cu create mode 100644 cpp/src/io/text/device_data_chunks.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8847ad36000..96fc75adcff 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -355,6 +355,7 @@ add_library( src/io/statistics/parquet_column_statistics.cu src/io/text/byte_range_info.cpp src/io/text/data_chunk_source_factories.cpp + src/io/text/bgzip_data_chunk_source.cu src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 0f6417878a6..6f94fb170a8 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -27,16 +27,51 @@ namespace cudf::io::text { /** * @brief Creates a data source capable of producing device-buffered views of the given string. + * @param data the host data to be exposed as a data chunk source. Its lifetime must be at least as + * long as the lifetime of the returned data_chunk_source. + * @return the data chunk source for the provided host data. It copies data from the host to the + * device. */ std::unique_ptr make_source(host_span data); /** * @brief Creates a data source capable of producing device-buffered views of the file + * @param filename the filename of the file to be exposed as a data chunk source. + * @return the data chunk source for the provided filename. It reads data from the file and copies + * it to the device. */ -std::unique_ptr make_source_from_file(std::string const& filename); +std::unique_ptr make_source_from_file(std::string_view filename); + +/** + * @brief Creates a data source capable of producing device-buffered views of a BGZIP compressed + * file. + * @param filename the filename of the BGZIP-compressed file to be exposed as a data chunk source. 
+ * @return the data chunk source for the provided filename. It reads data from the file and copies + * it to the device, where it will be decompressed. + */ +std::unique_ptr make_source_from_bgzip_file(std::string_view filename); + +/** + * @brief Creates a data source capable of producing device-buffered views of a BGZIP compressed + * file with virtual record offsets. + * @param filename the filename of the BGZIP-compressed file to be exposed as a data chunk source. + * @param virtual_begin the virtual (Tabix) offset of the first byte to be read. Its upper 48 bits + * describe the offset into the compressed file, its lower 16 bits describe the + * block-local offset. + * @param virtual_end the virtual (Tabix) offset one past the last byte to be read. + * @return the data chunk source for the provided filename. It reads data from the file and copies + * it to the device, where it will be decompressed. The chunk source only returns data + * between the virtual offsets `virtual_begin` and `virtual_end`. + */ +std::unique_ptr make_source_from_bgzip_file(std::string_view filename, + uint64_t virtual_begin, + uint64_t virtual_end); /** * @brief Creates a data source capable of producing views of the given device string scalar + * @param data the device data to be exposed as a data chunk source. Its lifetime must be at least + * as long as the lifetime of the returned data_chunk_source. + * @return the data chunk source for the provided device data. It does not create any copies. */ std::unique_ptr make_source(cudf::string_scalar& data); diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 31f7b9b472e..9fa442f3d08 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -68,7 +68,8 @@ namespace cudf::io::nvcomp { // Dispatcher for nvcompBatchedDecompressGetTempSizeEx template -nvcompStatus_t batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
args) +std::optional batched_decompress_get_temp_size_ex(compression_type compression, + Args&&... args) { #if NVCOMP_HAS_TEMPSIZE_EX switch (compression) { @@ -78,13 +79,13 @@ nvcompStatus_t batched_decompress_get_temp_size_ex(compression_type compression, #if NVCOMP_HAS_ZSTD_DECOMP return nvcompBatchedZstdDecompressGetTempSizeEx(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + return std::nullopt; #endif case compression_type::DEFLATE: [[fallthrough]]; - default: CUDF_FAIL("Unsupported compression type"); + default: return std::nullopt; } #endif - CUDF_FAIL("GetTempSizeEx is not supported in the current nvCOMP version"); + return std::nullopt; } // Dispatcher for nvcompBatchedDecompressGetTempSize @@ -138,16 +139,12 @@ size_t batched_decompress_temp_size(compression_type compression, size_t max_uncomp_chunk_size, size_t max_total_uncomp_size) { - size_t temp_size = 0; - auto const nvcomp_status = [&]() { - try { - return batched_decompress_get_temp_size_ex( - compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size); - } catch (cudf::logic_error const& err) { - return batched_decompress_get_temp_size( - compression, num_chunks, max_uncomp_chunk_size, &temp_size); - } - }(); + size_t temp_size = 0; + auto const nvcomp_status = + batched_decompress_get_temp_size_ex( + compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size) + .value_or(batched_decompress_get_temp_size( + compression, num_chunks, max_uncomp_chunk_size, &temp_size)); CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Unable to get scratch size for decompression"); diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu new file mode 100644 index 00000000000..7715c2ca7e1 --- /dev/null +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/comp/nvcomp_adapter.hpp" +#include "io/text/device_data_chunks.hpp" +#include "io/utilities/config_utils.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace cudf::io::text { + +namespace { + +/** + * @brief Transforms offset tuples of the form [compressed_begin, compressed_end, + * decompressed_begin, decompressed_end] into span tuples of the form [compressed_device_span, + * decompressed_device_span] based on the provided pointers. 
+ */ +struct bgzip_nvcomp_transform_functor { + uint8_t const* compressed_ptr; + uint8_t* decompressed_ptr; + + __device__ thrust::tuple, device_span> operator()( + thrust::tuple t) + { + auto const compressed_begin = thrust::get<0>(t); + auto const compressed_end = thrust::get<1>(t); + auto const decompressed_begin = thrust::get<2>(t); + auto const decompressed_end = thrust::get<3>(t); + return thrust::make_tuple(device_span{compressed_ptr + compressed_begin, + compressed_end - compressed_begin}, + device_span{decompressed_ptr + decompressed_begin, + decompressed_end - decompressed_begin}); + } +}; + +class bgzip_data_chunk_reader : public data_chunk_reader { + private: + template + static IntType read_int(char* data) + { + IntType result{}; + // we assume little-endian + std::memcpy(&result, &data[0], sizeof(result)); + return result; + } + + struct bgzip_header { + int block_size; + int extra_length; + [[nodiscard]] int data_size() const { return block_size - extra_length - 20; } + }; + + bgzip_header read_header() + { + std::array buffer{}; + _data_stream->read(buffer.data(), sizeof(buffer)); + std::array const expected_header{{31, 139, 8, 4}}; + CUDF_EXPECTS( + std::equal( + expected_header.begin(), expected_header.end(), reinterpret_cast(buffer.data())), + "malformed BGZIP header"); + // we ignore the remaining bytes of the fixed header, since they don't matter to us + auto const extra_length = read_int(&buffer[10]); + uint16_t extra_offset{}; + // read all the extra subfields + while (extra_offset < extra_length) { + auto const remaining_size = extra_length - extra_offset; + CUDF_EXPECTS(remaining_size >= 4, "invalid extra field length"); + // a subfield consists of 2 identifier bytes and a uint16 length + // 66/67 identifies a BGZIP block size field, we skip all other fields + _data_stream->read(buffer.data(), 4); + extra_offset += 4; + auto const subfield_size = read_int(&buffer[2]); + if (buffer[0] == 66 && buffer[1] == 67) { + // the block size subfield 
contains a single uint16 value, which is block_size - 1 + CUDF_EXPECTS(subfield_size == sizeof(uint16_t), "malformed BGZIP extra subfield"); + _data_stream->read(buffer.data(), sizeof(uint16_t)); + _data_stream->seekg(remaining_size - 6, std::ios_base::cur); + auto const block_size_minus_one = read_int(&buffer[0]); + return {block_size_minus_one + 1, extra_length}; + } else { + _data_stream->seekg(subfield_size, std::ios_base::cur); + extra_offset += subfield_size; + } + } + CUDF_FAIL("missing BGZIP size extra subfield"); + } + + struct bgzip_footer { + uint32_t decompressed_size; + }; + + bgzip_footer read_footer() + { + std::array buffer{}; + _data_stream->read(buffer.data(), sizeof(buffer)); + return {read_int(&buffer[4])}; + } + + template + using pinned_host_vector = + thrust::host_vector>; + + template + static void copy_to_device(const pinned_host_vector& host, + rmm::device_uvector& device, + rmm::cuda_stream_view stream) + { + device.resize(host.size(), stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyHostToDevice, stream.value())); + } + + struct decompression_blocks { + static constexpr std::size_t default_buffer_alloc = + 1 << 24; // 16MB buffer allocation, resized on demand + static constexpr std::size_t default_offset_alloc = + 1 << 16; // 64k offset allocation, resized on demand + + cudaEvent_t event; + pinned_host_vector h_compressed_blocks; + pinned_host_vector h_compressed_offsets; + pinned_host_vector h_decompressed_offsets; + rmm::device_uvector d_compressed_blocks; + rmm::device_uvector d_decompressed_blocks; + rmm::device_uvector d_compressed_offsets; + rmm::device_uvector d_decompressed_offsets; + rmm::device_uvector> d_compressed_spans; + rmm::device_uvector> d_decompressed_spans; + rmm::device_uvector d_decompression_results; + std::size_t compressed_size_with_headers{}; + std::size_t max_decompressed_size{}; + // this is usually equal to decompressed_size() + // unless we are in the 
last chunk, where it's limited by _local_end + std::size_t available_decompressed_size{}; + std::size_t read_pos{}; + bool is_decompressed{}; + + decompression_blocks(rmm::cuda_stream_view init_stream) + : d_compressed_blocks(0, init_stream), + d_decompressed_blocks(0, init_stream), + d_compressed_offsets(0, init_stream), + d_decompressed_offsets(0, init_stream), + d_compressed_spans(0, init_stream), + d_decompressed_spans(0, init_stream), + d_decompression_results(0, init_stream) + { + CUDF_CUDA_TRY(cudaEventCreate(&event)); + h_compressed_blocks.reserve(default_buffer_alloc); + h_compressed_offsets.reserve(default_offset_alloc); + h_compressed_offsets.push_back(0); + h_decompressed_offsets.reserve(default_offset_alloc); + h_decompressed_offsets.push_back(0); + } + + void decompress(rmm::cuda_stream_view stream) + { + if (is_decompressed) { return; } + copy_to_device(h_compressed_blocks, d_compressed_blocks, stream); + copy_to_device(h_compressed_offsets, d_compressed_offsets, stream); + copy_to_device(h_decompressed_offsets, d_decompressed_offsets, stream); + d_decompressed_blocks.resize(decompressed_size(), stream); + d_compressed_spans.resize(num_blocks(), stream); + d_decompressed_spans.resize(num_blocks(), stream); + d_decompression_results.resize(num_blocks(), stream); + + auto offset_it = thrust::make_zip_iterator(d_compressed_offsets.begin(), + d_compressed_offsets.begin() + 1, + d_decompressed_offsets.begin(), + d_decompressed_offsets.begin() + 1); + auto span_it = + thrust::make_zip_iterator(d_compressed_spans.begin(), d_decompressed_spans.begin()); + thrust::transform( + rmm::exec_policy_nosync(stream), + offset_it, + offset_it + num_blocks(), + span_it, + bgzip_nvcomp_transform_functor{reinterpret_cast(d_compressed_blocks.data()), + reinterpret_cast(d_decompressed_blocks.begin())}); + if (decompressed_size() > 0) { + if (cudf::io::detail::nvcomp_integration::is_all_enabled()) { + 
cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::DEFLATE, + d_compressed_spans, + d_decompressed_spans, + d_decompression_results, + max_decompressed_size, + decompressed_size(), + stream); + } else { + gpuinflate(d_compressed_spans, + d_decompressed_spans, + d_decompression_results, + gzip_header_included::NO, + stream); + } + } + is_decompressed = true; + } + + void reset() + { + h_compressed_blocks.resize(0); + h_compressed_offsets.resize(1); + h_decompressed_offsets.resize(1); + // shrinking doesn't allocate/free, so we don't need to worry about streams + auto stream = cudf::default_stream_value; + d_compressed_blocks.resize(0, stream); + d_decompressed_blocks.resize(0, stream); + d_compressed_offsets.resize(0, stream); + d_decompressed_offsets.resize(0, stream); + d_compressed_spans.resize(0, stream); + d_decompressed_spans.resize(0, stream); + d_decompression_results.resize(0, stream); + compressed_size_with_headers = 0; + max_decompressed_size = 0; + available_decompressed_size = 0; + read_pos = 0; + is_decompressed = false; + } + + [[nodiscard]] std::size_t num_blocks() const { return h_compressed_offsets.size() - 1; } + + [[nodiscard]] std::size_t compressed_size() const { return h_compressed_offsets.back(); } + + [[nodiscard]] std::size_t decompressed_size() const { return h_decompressed_offsets.back(); } + + [[nodiscard]] std::size_t remaining_size() const + { + return available_decompressed_size - read_pos; + } + + void read_block(bgzip_header header, std::istream& stream) + { + h_compressed_blocks.resize(h_compressed_blocks.size() + header.data_size()); + stream.read(h_compressed_blocks.data() + compressed_size(), header.data_size()); + } + + void add_block_offsets(bgzip_header header, bgzip_footer footer) + { + max_decompressed_size = + std::max(footer.decompressed_size, max_decompressed_size); + h_compressed_offsets.push_back(compressed_size() + header.data_size()); + h_decompressed_offsets.push_back(decompressed_size() + 
footer.decompressed_size); + } + + void consume_bytes(std::size_t size) + { + CUDF_EXPECTS(size <= remaining_size(), "out of bounds"); + read_pos += size; + } + }; + + void read_next_compressed_chunk(std::size_t requested_size) + { + std::swap(_curr_blocks, _prev_blocks); + if (_curr_blocks.is_decompressed) { + // synchronize on the last decompression + copy, so we don't clobber any buffers + CUDF_CUDA_TRY(cudaEventSynchronize(_curr_blocks.event)); + } + _curr_blocks.reset(); + // read chunks until we have enough decompressed data + while (_curr_blocks.decompressed_size() < requested_size) { + // calling peek on an already EOF stream causes it to fail, we need to avoid that + if (_data_stream->eof()) { break; } + // peek is necessary if we are already at the end, but didn't try to read another byte + _data_stream->peek(); + if (_data_stream->eof() || _compressed_pos > _compressed_end) { break; } + auto header = read_header(); + _curr_blocks.read_block(header, *_data_stream); + auto footer = read_footer(); + _curr_blocks.add_block_offsets(header, footer); + // for the last GZIP block, we restrict ourselves to the bytes up to _local_end + // but only for the reader, not for decompression! 
+ if (_compressed_pos == _compressed_end) { + _curr_blocks.available_decompressed_size += _local_end; + _compressed_pos += header.block_size; + break; + } else { + _curr_blocks.available_decompressed_size += footer.decompressed_size; + _compressed_pos += header.block_size; + } + } + } + + constexpr static std::size_t chunk_load_size = 1 << 24; // load 16 MB of data by default + + public: + bgzip_data_chunk_reader(std::unique_ptr input_stream, + uint64_t virtual_begin, + uint64_t virtual_end) + : _data_stream(std::move(input_stream)), + _prev_blocks{cudf::default_stream_value}, // here we can use the default stream because + _curr_blocks{cudf::default_stream_value}, // we only initialize empty device_uvectors + _local_end{virtual_end & 0xFFFFu}, + _compressed_pos{virtual_begin >> 16}, + _compressed_end{virtual_end >> 16} + { + // set failbit to throw on IO failures + _data_stream->exceptions(std::istream::failbit); + // seek to the beginning of the provided compressed offset + _data_stream->seekg(_compressed_pos, std::ios_base::cur); + // read the first blocks + read_next_compressed_chunk(chunk_load_size); + // seek to the beginning of the provided local offset + auto const local_pos = virtual_begin & 0xFFFFu; + if (local_pos > 0) { + CUDF_EXPECTS(_curr_blocks.h_compressed_offsets.size() > 1 && + local_pos < _curr_blocks.h_compressed_offsets[1], + "local part of virtual offset is out of bounds"); + _curr_blocks.consume_bytes(local_pos); + } + } + + void skip_bytes(std::size_t read_size) override + { + while (read_size > _curr_blocks.remaining_size()) { + read_size -= _curr_blocks.remaining_size(); + _curr_blocks.consume_bytes(_curr_blocks.remaining_size()); + read_next_compressed_chunk(chunk_load_size); + // calling peek on an already EOF stream causes it to fail, we need to avoid that + if (_data_stream->eof()) { break; } + // peek is necessary if we are already at the end, but didn't try to read another byte + _data_stream->peek(); + if (_data_stream->eof() || 
_compressed_pos > _compressed_end) { break; } + } + read_size = std::min(read_size, _curr_blocks.remaining_size()); + _curr_blocks.consume_bytes(read_size); + } + + std::unique_ptr get_next_chunk(std::size_t read_size, + rmm::cuda_stream_view stream) override + { + CUDF_FUNC_RANGE(); + if (read_size <= _curr_blocks.remaining_size()) { + _curr_blocks.decompress(stream); + rmm::device_uvector data(read_size, stream); + CUDF_CUDA_TRY( + cudaMemcpyAsync(data.data(), + _curr_blocks.d_decompressed_blocks.data() + _curr_blocks.read_pos, + read_size, + cudaMemcpyDeviceToDevice, + stream.value())); + // record the host-to-device copy, decompression and device copy + CUDF_CUDA_TRY(cudaEventRecord(_curr_blocks.event, stream.value())); + _curr_blocks.consume_bytes(read_size); + return std::make_unique(std::move(data)); + } + read_next_compressed_chunk(read_size /* - _curr_blocks.remaining_size()*/); + _prev_blocks.decompress(stream); + _curr_blocks.decompress(stream); + read_size = std::min(read_size, _prev_blocks.remaining_size() + _curr_blocks.remaining_size()); + rmm::device_uvector data(read_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), + _prev_blocks.d_decompressed_blocks.data() + _prev_blocks.read_pos, + _prev_blocks.remaining_size(), + cudaMemcpyDeviceToDevice, + stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync(data.data() + _prev_blocks.remaining_size(), + _curr_blocks.d_decompressed_blocks.data() + _curr_blocks.read_pos, + read_size - _prev_blocks.remaining_size(), + cudaMemcpyDeviceToDevice, + stream.value())); + // record the host-to-device copy, decompression and device copy + CUDF_CUDA_TRY(cudaEventRecord(_curr_blocks.event, stream.value())); + CUDF_CUDA_TRY(cudaEventRecord(_prev_blocks.event, stream.value())); + read_size -= _prev_blocks.remaining_size(); + _prev_blocks.consume_bytes(_prev_blocks.remaining_size()); + _curr_blocks.consume_bytes(read_size); + return std::make_unique(std::move(data)); + } + + private: + std::unique_ptr _data_stream; + 
decompression_blocks _prev_blocks; + decompression_blocks _curr_blocks; + std::size_t _local_end; + std::size_t _compressed_pos; + std::size_t _compressed_end; +}; + +class bgzip_data_chunk_source : public data_chunk_source { + public: + bgzip_data_chunk_source(std::string_view filename, uint64_t virtual_begin, uint64_t virtual_end) + : _filename{filename}, _virtual_begin{virtual_begin}, _virtual_end{virtual_end} + { + } + + [[nodiscard]] std::unique_ptr create_reader() const override + { + return std::make_unique( + std::make_unique(_filename, std::ifstream::in), _virtual_begin, _virtual_end); + } + + private: + std::string _filename; + uint64_t _virtual_begin; + uint64_t _virtual_end; +}; + +} // namespace + +std::unique_ptr make_source_from_bgzip_file(std::string_view filename, + uint64_t virtual_begin, + uint64_t virtual_end) +{ + return std::make_unique(filename, virtual_begin, virtual_end); +} + +std::unique_ptr make_source_from_bgzip_file(std::string_view filename) +{ + return std::make_unique( + filename, 0, std::numeric_limits::max()); +} + +} // namespace cudf::io::text diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 5256f5381e8..9a549951d66 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "io/text/device_data_chunks.hpp" + #include #include @@ -28,30 +30,6 @@ namespace cudf::io::text { namespace { -class device_span_data_chunk : public device_data_chunk { - public: - device_span_data_chunk(device_span data) : _data(data) {} - - [[nodiscard]] char const* data() const override { return _data.data(); } - [[nodiscard]] std::size_t size() const override { return _data.size(); } - operator device_span() const override { return _data; } - - private: - device_span _data; -}; - -class device_uvector_data_chunk : public device_data_chunk { - public: - device_uvector_data_chunk(rmm::device_uvector&& data) : _data(std::move(data)) {} - - [[nodiscard]] char const* data() const override { return _data.data(); } - [[nodiscard]] std::size_t size() const override { return _data.size(); } - operator device_span() const override { return _data; } - - private: - rmm::device_uvector _data; -}; - /** * @brief A reader which produces owning chunks of device memory which contain a copy of the data * from an istream. @@ -207,7 +185,7 @@ class device_span_data_chunk_reader : public data_chunk_reader { */ class file_data_chunk_source : public data_chunk_source { public: - file_data_chunk_source(std::string filename) : _filename(std::move(filename)) {} + file_data_chunk_source(std::string_view filename) : _filename(filename) {} [[nodiscard]] std::unique_ptr create_reader() const override { return std::make_unique( @@ -255,7 +233,7 @@ std::unique_ptr make_source(host_span data) return std::make_unique(data); } -std::unique_ptr make_source_from_file(std::string const& filename) +std::unique_ptr make_source_from_file(std::string_view filename) { return std::make_unique(filename); } diff --git a/cpp/src/io/text/device_data_chunks.hpp b/cpp/src/io/text/device_data_chunks.hpp new file mode 100644 index 00000000000..3f971ae147a --- /dev/null +++ b/cpp/src/io/text/device_data_chunks.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::io::text { + +class device_span_data_chunk : public device_data_chunk { + public: + device_span_data_chunk(device_span data) : _data(data) {} + + [[nodiscard]] char const* data() const override { return _data.data(); } + [[nodiscard]] std::size_t size() const override { return _data.size(); } + operator device_span() const override { return _data; } + + private: + device_span _data; +}; + +class device_uvector_data_chunk : public device_data_chunk { + public: + device_uvector_data_chunk(rmm::device_uvector&& data) : _data(std::move(data)) {} + + [[nodiscard]] char const* data() const override { return _data.data(); } + [[nodiscard]] std::size_t size() const override { return _data.size(); } + operator device_span() const override { return _data; } + + private: + rmm::device_uvector _data; +}; + +} // namespace cudf::io::text diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index ed660cb1792..115a66cdd95 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -22,6 +22,7 @@ #include #include +#include using namespace cudf::test; @@ -42,8 +43,8 @@ void test_source(const std::string& content, const cudf::io::text::data_chunk_so { { // full contents - auto reader = source.create_reader(); - auto chunk = reader->get_next_chunk(content.size(), 
rmm::cuda_stream_default); + auto reader = source.create_reader(); + auto const chunk = reader->get_next_chunk(content.size(), rmm::cuda_stream_default); ASSERT_EQ(chunk->size(), content.size()); ASSERT_EQ(chunk_to_host(*chunk), content); } @@ -51,15 +52,25 @@ void test_source(const std::string& content, const cudf::io::text::data_chunk_so // skipping contents auto reader = source.create_reader(); reader->skip_bytes(4); - auto chunk = reader->get_next_chunk(content.size(), rmm::cuda_stream_default); + auto const chunk = reader->get_next_chunk(content.size(), rmm::cuda_stream_default); ASSERT_EQ(chunk->size(), content.size() - 4); ASSERT_EQ(chunk_to_host(*chunk), content.substr(4)); } + { + // reading multiple chunks, starting with a small one + auto reader = source.create_reader(); + auto const chunk1 = reader->get_next_chunk(5, rmm::cuda_stream_default); + auto const chunk2 = reader->get_next_chunk(content.size() - 5, rmm::cuda_stream_default); + ASSERT_EQ(chunk1->size(), 5); + ASSERT_EQ(chunk2->size(), content.size() - 5); + ASSERT_EQ(chunk_to_host(*chunk1), content.substr(0, 5)); + ASSERT_EQ(chunk_to_host(*chunk2), content.substr(5)); + } { // reading multiple chunks - auto reader = source.create_reader(); - auto chunk1 = reader->get_next_chunk(content.size() / 2, rmm::cuda_stream_default); - auto chunk2 = + auto reader = source.create_reader(); + auto const chunk1 = reader->get_next_chunk(content.size() / 2, rmm::cuda_stream_default); + auto const chunk2 = reader->get_next_chunk(content.size() - content.size() / 2, rmm::cuda_stream_default); ASSERT_EQ(chunk1->size(), content.size() / 2); ASSERT_EQ(chunk2->size(), content.size() - content.size() / 2); @@ -68,8 +79,8 @@ void test_source(const std::string& content, const cudf::io::text::data_chunk_so } { // reading too many bytes - auto reader = source.create_reader(); - auto chunk = reader->get_next_chunk(content.size() + 10, rmm::cuda_stream_default); + auto reader = source.create_reader(); + auto const chunk = 
reader->get_next_chunk(content.size() + 10, rmm::cuda_stream_default); ASSERT_EQ(chunk->size(), content.size()); ASSERT_EQ(chunk_to_host(*chunk), content); auto next_chunk = reader->get_next_chunk(1, rmm::cuda_stream_default); @@ -79,39 +90,245 @@ void test_source(const std::string& content, const cudf::io::text::data_chunk_so // skipping past the end auto reader = source.create_reader(); reader->skip_bytes(content.size() + 10); - auto next_chunk = reader->get_next_chunk(1, rmm::cuda_stream_default); + auto const next_chunk = reader->get_next_chunk(1, rmm::cuda_stream_default); ASSERT_EQ(next_chunk->size(), 0); } } TEST_F(DataChunkSourceTest, Device) { - std::string content = "device buffer source"; + std::string const content = "device buffer source"; cudf::string_scalar scalar(content); - auto source = cudf::io::text::make_source(scalar); + auto const source = cudf::io::text::make_source(scalar); test_source(content, *source); } TEST_F(DataChunkSourceTest, File) { - std::string content = "file source"; - auto filename = temp_env->get_temp_filepath("file_source"); + std::string const content = "file source"; + auto const filename = temp_env->get_temp_filepath("file_source"); { std::ofstream file{filename}; file << content; } - auto source = cudf::io::text::make_source_from_file(filename); + auto const source = cudf::io::text::make_source_from_file(filename); test_source(content, *source); } TEST_F(DataChunkSourceTest, Host) { - std::string content = "host buffer source"; - auto source = cudf::io::text::make_source(content); + std::string const content = "host buffer source"; + auto const source = cudf::io::text::make_source(content); test_source(content, *source); } +template +void write_int(std::ostream& stream, T val) +{ + std::array bytes; + // we assume little-endian + std::memcpy(&bytes[0], &val, sizeof(T)); + stream.write(bytes.data(), bytes.size()); +} + +void write_bgzip_block(std::ostream& stream, + const std::string& data, + bool 
add_extra_garbage_before, + bool add_extra_garbage_after) +{ + std::array const header{{ + 31, // magic number + 139, // magic number + 8, // compression type: deflate + 4, // flags: extra header + 0, // mtime + 0, // mtime + 0, // mtime + 0, // mtime: irrelevant + 4, // xfl: irrelevant + 3 // OS: irrelevant + }}; + std::array const extra_blocklen_field{{66, 67, 2, 0}}; + std::array const extra_garbage_field1{{13, // magic number + 37, // magic number + 7, // field length + 0, // field length + 1, + 2, + 3, + 4, + 5, + 6, + 7}}; + std::array const extra_garbage_field2{{12, // magic number + 34, // magic number + 2, // field length + 0, // field length + 1, 2, + 56, // magic number + 78, // magic number + 1, // field length + 0, // field length + 3, // + 90, // magic number + 12, // magic number + 8, // field length + 0, // field length + 1, 2, 3, 4, 5, 6, 7, 8}}; + stream.write(reinterpret_cast(header.data()), header.size()); + uint16_t extra_size = extra_blocklen_field.size() + 2; + if (add_extra_garbage_before) { extra_size += extra_garbage_field1.size(); } + if (add_extra_garbage_after) { extra_size += extra_garbage_field2.size(); } + write_int(stream, extra_size); + if (add_extra_garbage_before) { + stream.write(extra_garbage_field1.data(), extra_garbage_field1.size()); + } + stream.write(extra_blocklen_field.data(), extra_blocklen_field.size()); + auto const compressed_size = data.size() + 5; + uint16_t const block_size_minus_one = compressed_size + 19 + extra_size; + write_int(stream, block_size_minus_one); + if (add_extra_garbage_after) { + stream.write(extra_garbage_field2.data(), extra_garbage_field2.size()); + } + write_int(stream, 1); + write_int(stream, data.size()); + write_int(stream, ~static_cast(data.size())); + stream.write(data.data(), data.size()); + // this does not produce a valid file, since we write 0 as the CRC + // the parser ignores the checksum, so it doesn't matter to the test + // to check output with gzip, plug in the CRC of `data` 
here. + write_int(stream, 0); + write_int(stream, data.size()); +} + +void write_bgzip(std::ostream& stream, + const std::string& data, + std::default_random_engine& rng, + bool write_eof = true) +{ + // make sure the block size with header stays below 65536 + std::uniform_int_distribution block_size_dist{1, 65000}; + auto begin = data.begin(); + auto const end = data.end(); + int i = 0; + while (begin < end) { + auto len = std::min(end - begin, block_size_dist(rng)); + write_bgzip_block(stream, std::string{begin, begin + len}, i & 1, i & 2); + begin += len; + i++; + } + if (write_eof) { write_bgzip_block(stream, {}, false, false); } +} + +TEST_F(DataChunkSourceTest, BgzipSource) +{ + auto const filename = temp_env->get_temp_filepath("bgzip_source"); + std::string input{"bananarama"}; + for (int i = 0; i < 24; i++) { + input = input + input; + } + { + std::ofstream stream{filename}; + std::default_random_engine rng{}; + write_bgzip(stream, input, rng); + } + + auto const source = cudf::io::text::make_source_from_bgzip_file(filename); + + test_source(input, *source); +} + +TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsets) +{ + auto const filename = temp_env->get_temp_filepath("bgzip_source"); + std::string input{"bananarama"}; + for (int i = 0; i < 24; i++) { + input = input + input; + } + std::string padding_garbage{"garbage"}; + for (int i = 0; i < 10; i++) { + padding_garbage = padding_garbage + padding_garbage; + } + std::string const data_garbage{"GARBAGE"}; + std::string const begininput{"begin of bananarama"}; + std::string const endinput{"end of bananarama"}; + std::size_t begin_compressed_offset{}; + std::size_t end_compressed_offset{}; + std::size_t const begin_local_offset{data_garbage.size()}; + std::size_t const end_local_offset{endinput.size()}; + { + std::ofstream stream{filename}; + stream.write(padding_garbage.data(), padding_garbage.size()); + std::default_random_engine rng{}; + begin_compressed_offset = stream.tellp(); + 
write_bgzip_block(stream, data_garbage + begininput, false, false); + write_bgzip(stream, input, rng, false); + end_compressed_offset = stream.tellp(); + write_bgzip_block(stream, endinput + data_garbage + data_garbage, false, false); + write_bgzip_block(stream, {}, false, false); + stream.write(padding_garbage.data(), padding_garbage.size()); + } + input = begininput + input + endinput; + + auto const source = + cudf::io::text::make_source_from_bgzip_file(filename, + begin_compressed_offset << 16 | begin_local_offset, + end_compressed_offset << 16 | end_local_offset); + + test_source(input, *source); +} + +TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleGZipBlock) +{ + auto const filename = temp_env->get_temp_filepath("bgzip_source"); + std::string const input{"collection unit brings"}; + std::string const head_garbage{"garbage"}; + std::string const tail_garbage{"GARBAGE"}; + std::size_t begin_compressed_offset{}; + std::size_t end_compressed_offset{}; + std::size_t const begin_local_offset{head_garbage.size()}; + std::size_t const end_local_offset{head_garbage.size() + input.size()}; + { + std::ofstream stream{filename}; + write_bgzip_block(stream, head_garbage + input + tail_garbage, false, false); + write_bgzip_block(stream, {}, false, false); + } + + auto const source = + cudf::io::text::make_source_from_bgzip_file(filename, + begin_compressed_offset << 16 | begin_local_offset, + end_compressed_offset << 16 | end_local_offset); + + test_source(input, *source); +} + +TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleChunk) +{ + auto const filename = temp_env->get_temp_filepath("bgzip_source"); + std::string const input{"collection unit brings"}; + std::string const head_garbage{"garbage"}; + std::string const tail_garbage{"GARBAGE"}; + std::size_t begin_compressed_offset{}; + std::size_t end_compressed_offset{}; + std::size_t const begin_local_offset{head_garbage.size()}; + std::size_t const end_local_offset{input.size() - 10}; + { + 
std::ofstream stream{filename}; + write_bgzip_block(stream, head_garbage + input.substr(0, 10), false, false); + end_compressed_offset = stream.tellp(); + write_bgzip_block(stream, input.substr(10) + tail_garbage, false, false); + write_bgzip_block(stream, {}, false, false); + } + + auto const source = + cudf::io::text::make_source_from_bgzip_file(filename, + begin_compressed_offset << 16 | begin_local_offset, + end_compressed_offset << 16 | end_local_offset); + + test_source(input, *source); +} + CUDF_TEST_PROGRAM_MAIN() From 35b0a52a6288cc971bb731371cda8d72772b530b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 27 Sep 2022 04:50:34 -0500 Subject: [PATCH 09/10] Enable `schema_element` & `keep_quotes` support in json reader (#11746) This PR plumbs `schema_element` and `keep_quotes` support in json reader. **Deprecation:** This PR also contains changes deprecating `dtype` as `list` inputs. This seems to be a very outdated legacy feature we continued to support and cannot be supported with the `schema_element`. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/11746 --- python/cudf/cudf/_lib/cpp/io/json.pxd | 12 ++- python/cudf/cudf/_lib/json.pyx | 39 +++++++- python/cudf/cudf/io/json.py | 17 +++- python/cudf/cudf/tests/test_json.py | 135 ++++++++++++++++++++++++-- python/cudf/cudf/utils/ioutils.py | 29 ++++++ 5 files changed, 217 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index bc9d87a5cbf..7333aad7ddf 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -15,6 +15,10 @@ from cudf._lib.cpp.types cimport data_type, size_type cdef extern from "cudf/io/json.hpp" \ namespace "cudf::io" nogil: + cdef struct schema_element: + data_type type + map[string, schema_element] child_types + cdef cppclass json_reader_options: json_reader_options() except+ cudf_io_types.source_info get_source() except+ @@ -28,7 +32,7 @@ cdef extern from "cudf/io/json.hpp" \ # setter void set_dtypes(vector[data_type] types) except+ - void set_dtypes(map[string, data_type] types) except+ + void set_dtypes(map[string, schema_element] types) except+ void set_compression( cudf_io_types.compression_type compression ) except+ @@ -37,6 +41,7 @@ cdef extern from "cudf/io/json.hpp" \ void enable_lines(bool val) except+ void enable_dayfirst(bool val) except+ void enable_experimental(bool val) except+ + void enable_keep_quotes(bool val) except+ @staticmethod json_reader_options_builder builder( @@ -55,7 +60,7 @@ cdef extern from "cudf/io/json.hpp" \ vector[data_type] types ) except+ json_reader_options_builder& dtypes( - map[string, data_type] types + map[string, schema_element] types ) except+ json_reader_options_builder& compression( cudf_io_types.compression_type compression @@ -75,6 +80,9 @@ cdef extern from "cudf/io/json.hpp" \ 
json_reader_options_builder& experimental( bool val ) except+ + json_reader_options_builder& keep_quotes( + bool val + ) except+ json_reader_options build() except+ diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 376850b7b1b..b0aafc275d6 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -20,6 +20,7 @@ cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.io.json cimport ( json_reader_options, read_json as libcudf_read_json, + schema_element, ) from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.io.utils cimport make_source_info, update_struct_field_names @@ -32,7 +33,8 @@ cpdef read_json(object filepaths_or_buffers, bool lines, object compression, object byte_range, - bool experimental): + bool experimental, + bool keep_quotes): """ Cython function to call into libcudf API, see `read_json`. @@ -55,7 +57,7 @@ cpdef read_json(object filepaths_or_buffers, # Setup arguments cdef vector[data_type] c_dtypes_list - cdef map[string, data_type] c_dtypes_map + cdef map[string, schema_element] c_dtypes_schema_map cdef cudf_io_types.compression_type c_compression # Determine byte read offsets if applicable cdef size_type c_range_offset = ( @@ -81,8 +83,8 @@ cpdef read_json(object filepaths_or_buffers, elif dtype is not True: if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - c_dtypes_map[str(k).encode()] = \ - _get_cudf_data_type_from_dtype(v) + c_dtypes_schema_map[str(k).encode()] = \ + _get_cudf_schema_element_from_dtype(v) elif isinstance(dtype, abc.Collection): is_list_like_dtypes = True c_dtypes_list.reserve(len(dtype)) @@ -105,8 +107,9 @@ cpdef read_json(object filepaths_or_buffers, if is_list_like_dtypes: opts.set_dtypes(c_dtypes_list) else: - opts.set_dtypes(c_dtypes_map) + opts.set_dtypes(c_dtypes_schema_map) + opts.enable_keep_quotes(keep_quotes) # Read JSON cdef cudf_io_types.table_with_metadata c_result @@ -123,6 +126,32 @@ cpdef read_json(object 
filepaths_or_buffers, return df + +cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +: + cdef schema_element s_element + cdef data_type lib_type + if cudf.api.types.is_categorical_dtype(dtype): + raise NotImplementedError( + "CategoricalDtype as dtype is not yet " + "supported in JSON reader" + ) + + dtype = cudf.dtype(dtype) + lib_type = dtype_to_data_type(dtype) + s_element.type = lib_type + if isinstance(dtype, cudf.StructDtype): + for name, child_type in dtype.fields.items(): + s_element.child_types[name.encode()] = \ + _get_cudf_schema_element_from_dtype(child_type) + elif isinstance(dtype, cudf.ListDtype): + s_element.child_types["offsets".encode()] = \ + _get_cudf_schema_element_from_dtype(cudf.dtype("int32")) + s_element.child_types["element".encode()] = \ + _get_cudf_schema_element_from_dtype(dtype.element_type) + + return s_element + + cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: if cudf.api.types.is_categorical_dtype(dtype): raise NotImplementedError( diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index e1e8e7cdb3d..2a0ae565974 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -21,13 +21,27 @@ def read_json( lines=False, compression="infer", byte_range=None, + keep_quotes=False, *args, **kwargs, ): """{docstring}""" + if not isinstance(dtype, (abc.Mapping, bool)): + warnings.warn( + "passing 'dtype' as list is deprecated, instead pass " + "a dict of column name and types key-value paris." 
+ "in future versions 'dtype' can only be a dict or bool", + FutureWarning, + ) + if engine == "cudf" and not lines: - raise ValueError("cudf engine only supports JSON Lines format") + raise ValueError(f"{engine} engine only supports JSON Lines format") + if engine != "cudf_experimental" and keep_quotes: + raise ValueError( + "keep_quotes='True' is supported only with" + " engine='cudf_experimental'" + ) if engine == "auto": engine = "cudf" if lines else "pandas" if engine == "cudf" or engine == "cudf_experimental": @@ -64,6 +78,7 @@ def read_json( compression, byte_range, engine == "cudf_experimental", + keep_quotes, ) else: warnings.warn( diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 92227707b18..1fdef44546a 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -274,11 +274,10 @@ def test_json_lines_byte_range(json_input): assert df.shape == (1, 3) -@pytest.mark.parametrize( - "dtype", [["float", "int", "short"], {1: "int", 2: "short", 0: "float"}] -) -def test_json_lines_dtypes(json_input, dtype): - df = cudf.read_json(json_input, lines=True, dtype=dtype) +def test_json_lines_dtypes(json_input): + df = cudf.read_json( + json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"} + ) assert all(df.dtypes == ["float64", "int64", "int16"]) @@ -302,7 +301,10 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp): pd_df.to_json(fname, compression=out_comp, lines=True, orient="records") cu_df = cudf.read_json( - str(fname), compression=in_comp, lines=True, dtype=["int32", "int32"] + str(fname), + compression=in_comp, + lines=True, + dtype={"col1": "int32", "col2": "int32"}, ) assert_eq(pd_df, cu_df) @@ -345,7 +347,9 @@ def test_json_bool_values(): # boolean values should be converted to 0/1 np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy()) - cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"]) + cu_df = cudf.read_json( + buffer, 
lines=True, dtype={"0": "bool", "1": "long"} + ) np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) @@ -663,3 +667,120 @@ def test_json_types_data(): pdf, schema=df.to_arrow().schema, safe=False ) assert df.to_arrow().equals(pa_table_pdf) + + +@pytest.mark.parametrize( + "keep_quotes,result", + [ + ( + True, + { + "c1": [ + {"f1": '"sf11"', "f2": '"sf21"'}, + {"f1": '"sf12"', "f2": '"sf22"'}, + ], + "c2": [['"l11"', '"l21"'], ['"l12"', '"l22"']], + }, + ), + ( + False, + { + "c1": [ + {"f1": "sf11", "f2": "sf21"}, + {"f1": "sf12", "f2": "sf22"}, + ], + "c2": [["l11", "l21"], ["l12", "l22"]], + }, + ), + ], +) +def test_json_keep_quotes(keep_quotes, result): + bytes_file = BytesIO() + data = { + "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], + "c2": [["l11", "l21"], ["l12", "l22"]], + } + pdf = pd.DataFrame(data) + pdf.to_json(bytes_file, orient="records", lines=True) + + actual = cudf.read_json( + bytes_file, + engine="cudf_experimental", + orient="records", + lines=True, + keep_quotes=keep_quotes, + ) + expected = pd.DataFrame(result) + + assert_eq(actual, expected) + + +def test_json_dtypes_nested_data(): + # a: StructDtype({'a': StructDtype({'b': dtype('float64')}), + # 'b': dtype('int64')}) + # b: ListDtype(ListDtype(float64)) + actual_json_str = ( + '{"a":{"a":{"b":10.0},"b":11},"b":[[10.0,1.1],[12.0,23.0]]}\n' + '{"a":{"a":{"b":107.0},"b":5},"b":[[10.0,11.2],[12.0,0.23]]}\n' + '{"a":{"a":{"b":50.7},"b":2},"b":[[10.0,11.3],[12.0,2.3]]}\n' + '{"a":{"a":{"b":1.2},"b":67},"b":[[6.0,7.0]]}\n' + '{"a":{"a":{"b":40.1},"b":1090},"b":null}\n' + ) + + """ + In [3]: df + Out[3]: + a b + 0 {'a': {'b': 10.0}, 'b': 11} [[10.0, 1.1], [12.0, 23.0]] + 1 {'a': {'b': 107.0}, 'b': 5} [[10.0, 11.2], [12.0, 0.23]] + 2 {'a': {'b': 50.7}, 'b': 2} [[10.0, 11.3], [12.0, 2.3]] + 3 {'a': {'b': 1.2}, 'b': 67} [[6.0, 7.0]] + 4 {'a': {'b': 40.1}, 'b': 1090} None + """ + + # a: StructDtype({'a': StructDtype({'b': dtype('int64')}), + # 'b': dtype('float64')}) + # 
b: ListDtype(ListDtype(int64)) + expected_json_str = ( + '{"a":{"a":{"b":10},"b":11.0},"b":[[10,1],[12,23]]}\n' + '{"a":{"a":{"b":107},"b":5.0},"b":[[10,11],[12,0]]}\n' + '{"a":{"a":{"b":50},"b":2.0},"b":[[10,11],[12,2]]}\n' + '{"a":{"a":{"b":1},"b":67.0},"b":[[6,7]]}\n' + '{"a":{"a":{"b":40},"b":1090.0},"b":null}\n' + ) + + """ + In [7]: df + Out[7]: + a b + 0 {'a': {'b': 10}, 'b': 11.0} [[10, 1], [12, 23]] + 1 {'a': {'b': 107}, 'b': 5.0} [[10, 11], [12, 0]] + 2 {'a': {'b': 50}, 'b': 2.0} [[10, 11], [12, 2]] + 3 {'a': {'b': 1}, 'b': 67.0} [[6, 7]] + 4 {'a': {'b': 40}, 'b': 1090.0} None + """ + + df = cudf.read_json( + StringIO(actual_json_str), + engine="cudf_experimental", + orient="records", + lines=True, + dtype={ + "a": cudf.StructDtype( + { + "a": cudf.StructDtype({"b": cudf.dtype("int64")}), + "b": cudf.dtype("float64"), + } + ), + "b": cudf.ListDtype(cudf.ListDtype("int64")), + }, + ) + + pdf = pd.read_json( + StringIO(expected_json_str), orient="records", lines=True + ) + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 84d39459a12..9670a5e2d81 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -559,6 +559,11 @@ size in bytes. Set the size to zero to read all data after the offset location. Reads the row that starts before or at the end of the range, even if it ends after the end of the range. +keep_quotes : bool, default False + This parameter is only supported in ``cudf_experimental`` engine. + If `True`, any string values are read literally (and wrapped in an + additional set of quotes). + If `False` string values are parsed into Python strings. 
 Returns ------- @@ -567,6 +572,30 @@ See Also -------- cudf.DataFrame.to_json + +Examples +-------- +>>> import cudf +>>> df = cudf.DataFrame({'a': ["hello", "rapids"], 'b': ["hello", "worlds"]}) +>>> df + a b +0 hello hello +1 rapids worlds +>>> json_str = df.to_json(orient='records', lines=True) +>>> json_str +'{"a":"hello","b":"hello"}\n{"a":"rapids","b":"worlds"}\n' +>>> cudf.read_json(json_str, engine="cudf", lines=True) + a b +0 hello hello +1 rapids worlds + +To read the strings with additional set of quotes: + +>>> cudf.read_json(json_str, engine="cudf_experimental", lines=True, +... keep_quotes=True) + a b +0 "hello" "hello" +1 "rapids" "worlds" """ doc_read_json = docfmt_partial(docstring=_docstring_read_json) From a270ae6303487397a21ade949786c43636194752 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 27 Sep 2022 09:30:06 -0500 Subject: [PATCH 10/10] Add `istitle` to string UDFs (#11738) This PR adds support for the use of the `str.istitle()` method within UDFs for `apply`. 
Authors: - https://github.com/brandon-b-miller - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/11738 --- python/cudf/cudf/core/udf/strings_lowering.py | 2 ++ python/cudf/cudf/tests/test_udf_masked_ops.py | 13 ++++++++++ .../include/cudf/strings/udf/char_types.cuh | 24 +++++++++++++++++++ .../strings_udf/cpp/src/strings/udf/shim.cu | 9 +++++++ python/strings_udf/strings_udf/_typing.py | 1 + python/strings_udf/strings_udf/lowering.py | 6 +++++ .../strings_udf/tests/test_string_udfs.py | 12 ++++++++++ 7 files changed, 67 insertions(+) diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index 5b69d1a9da3..59041977f87 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -19,6 +19,7 @@ isdigit_impl, islower_impl, isspace_impl, + istitle_impl, isupper_impl, len_impl, rfind_impl, @@ -123,3 +124,4 @@ def masked_unary_func_impl(context, builder, sig, args): create_masked_unary_identifier_func("MaskedType.islower", islower_impl) create_masked_unary_identifier_func("MaskedType.isspace", isspace_impl) create_masked_unary_identifier_func("MaskedType.isdecimal", isdecimal_impl) +create_masked_unary_identifier_func("MaskedType.istitle", istitle_impl) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 2b96c920765..20245bd2a20 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -56,6 +56,11 @@ def str_udf_data(): "cudf", "cuda", "gpu", + "This Is A Title", + "This is Not a Title", + "Neither is This a Title", + "NoT a TiTlE", + "123 Title Works", ] } ) @@ -839,6 +844,14 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) +@string_udf_test +def 
test_string_udf_istitle(str_udf_data): + def func(row): + return row["str_col"].istitle() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + @string_udf_test def test_string_udf_count(str_udf_data, substr): def func(row): diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh index e28111fd1f2..9320686442b 100644 --- a/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh +++ b/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh @@ -183,6 +183,30 @@ __device__ inline bool is_lower(cudf::strings::detail::character_flags_table_typ flags_table, d_str, string_character_types::LOWER, string_character_types::CASE_TYPES); } +/** + * @brief Returns true if string is in title case + * + * @param tables The char tables required for checking characters + * @param d_str Input string to check + * @return True if string is in title case + */ +__device__ inline bool is_title(cudf::strings::detail::character_flags_table_type* flags_table, + string_view d_str) +{ + auto valid = false; // requires one or more cased characters + auto should_be_capitalized = true; // current character should be upper-case + for (auto const chr : d_str) { + auto const code_point = cudf::strings::detail::utf8_to_codepoint(chr); + auto const flag = code_point <= 0x00FFFF ? 
flags_table[code_point] : 0; + if (cudf::strings::detail::IS_UPPER_OR_LOWER(flag)) { + if (should_be_capitalized == !cudf::strings::detail::IS_UPPER(flag)) return false; + valid = true; + } + should_be_capitalized = !cudf::strings::detail::IS_UPPER_OR_LOWER(flag); + } + return valid; +} + } // namespace udf } // namespace strings } // namespace cudf diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 656861f9cd6..4d6690468ff 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -198,6 +198,15 @@ extern "C" __device__ int pyisalpha(bool* nb_retval, void const* str, std::int64 return 0; } +extern "C" __device__ int pyistitle(bool* nb_retval, void const* str, std::int64_t chars_table) +{ + auto str_view = reinterpret_cast(str); + + *nb_retval = is_title( + reinterpret_cast(chars_table), *str_view); + return 0; +} + extern "C" __device__ int pycount(int* nb_retval, void const* str, void const* substr) { auto str_view = reinterpret_cast(str); diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 2e4519a01fe..675507bccde 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -209,6 +209,7 @@ def resolve_count(self, mod): "islower", "isspace", "isnumeric", + "istitle", ] for func in bool_binary_funcs: diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index fd965a7a187..df0902dfa98 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -76,6 +76,7 @@ def _declare_binary_func(lhs, rhs, out, name): _string_view_isspace = _declare_bool_str_int_func("pyisspace") _string_view_isupper = _declare_bool_str_int_func("pyisupper") _string_view_islower = _declare_bool_str_int_func("pyislower") +_string_view_istitle = _declare_bool_str_int_func("pyistitle") 
_string_view_count = cuda.declare_device( @@ -285,3 +286,8 @@ def isupper_impl(st, tbl): @create_unary_identifier_func("StringView.islower") def islower_impl(st, tbl): return _string_view_islower(st, tbl) + + +@create_unary_identifier_func("StringView.istitle") +def istitle_impl(st, tbl): + return _string_view_istitle(st, tbl) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 9038f4cc79a..f214915ae12 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -89,6 +89,11 @@ def data(): "cudf", "cuda", "gpu", + "This Is A Title", + "This is Not a Title", + "Neither is This a Title", + "NoT a TiTlE", + "123 Title Works", ] @@ -228,6 +233,13 @@ def func(st): run_udf_test(data, func, "bool") +def test_string_udf_istitle(data): + def func(st): + return st.istitle() + + run_udf_test(data, func, "bool") + + def test_string_udf_len(data): def func(st): return len(st)