From dbdcc31fe1cbe902d495428da3c68dc59d289dc5 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 4 Mar 2024 18:22:49 +0000 Subject: [PATCH 1/4] Expose new stable_sort and finish stream_compaction in pylibcudf (#15175) Completes coverage of `sorting.hpp` and `stream_compaction.hpp` Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15175 --- python/cudf/cudf/_lib/cpp/sorting.pxd | 7 +- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 43 +++- python/cudf/cudf/_lib/pylibcudf/sorting.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/sorting.pyx | 39 +++- .../cudf/_lib/pylibcudf/stream_compaction.pxd | 34 +++- .../cudf/_lib/pylibcudf/stream_compaction.pyx | 185 ++++++++++++++++-- python/cudf/cudf/_lib/stream_compaction.pyx | 1 + 7 files changed, 275 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index 68f01003fe6..86dc0f0de95 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -68,3 +68,8 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: table_view source_table, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + + + cdef unique_ptr[table] stable_sort( + table_view source_table, + vector[libcudf_types.order] column_order, + vector[libcudf_types.null_order] null_precedence) except + diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index e8539ecb9c3..55854a9444f 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -30,21 +30,28 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: vector[size_type] keys, size_type keep_threshold) except + + cdef unique_ptr[table] drop_nans(table_view source_table, + vector[size_type] keys, + size_type keep_threshold) except + + cdef unique_ptr[table] apply_boolean_mask( table_view source_table, column_view boolean_mask ) except + - cdef size_type distinct_count( - column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] unique( + table_view input, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal, + ) except + - cdef unique_ptr[table] stable_distinct( + cdef unique_ptr[table] distinct( table_view input, vector[size_type] keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equals, ) except + cdef unique_ptr[column] distinct_indices( @@ -53,3 +60,29 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: null_equality nulls_equal, nan_equality nans_equal, ) except + + + cdef unique_ptr[table] stable_distinct( + table_view input, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + ) except + + + cdef size_type unique_count( + column_view column, + null_policy null_handling, + nan_policy nan_handling) except + + + cdef size_type unique_count( + table_view source_table, + null_policy null_handling) except + + + cdef size_type distinct_count( + column_view column, + null_policy null_handling, + nan_policy nan_handling) except + + + cdef size_type distinct_count( + table_view source_table, + null_policy null_handling) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd index fb22da0b0fd..3ed241622c0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd @@ -59,3 +59,5 @@ cpdef Table stable_sort_by_key( ) cpdef Table sort(Table source_table, list column_order, list null_precedence) + +cpdef Table stable_sort(Table source_table, list column_order, list null_precedence) diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx index 4e73760720a..1668a3efc7c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx @@ -50,7 +50,8 @@ cpdef Column stable_sorted_order( list column_order, list null_precedence, ): - """Computes the row indices required to sort the table, maintaining input order. + """Computes the row indices required to sort the table, + preserving order of equal elements. Parameters ---------- @@ -206,7 +207,8 @@ cpdef Table stable_segmented_sort_by_key( list column_order, list null_precedence, ): - """Sorts the table by key, within segments, maintaining input order. + """Sorts the table by key preserving order of equal elements, + within segments. Parameters ---------- @@ -287,7 +289,7 @@ cpdef Table stable_sort_by_key( list column_order, list null_precedence, ): - """Sorts the table by key, maintaining input order. + """Sorts the table by key preserving order of equal elements. Parameters ---------- @@ -349,3 +351,34 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): ) ) return Table.from_libcudf(move(c_result)) + + +cpdef Table stable_sort(Table source_table, list column_order, list null_precedence): + """Sorts the table preserving order of equal elements. + + Parameters + ---------- + source_table : Table + The table to sort. + column_order : List[ColumnOrder] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. + + Returns + ------- + Table + The sorted table. + """ + cdef unique_ptr[table] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_sorting.stable_sort( + source_table.view(), + c_orders, + c_null_precedence, + ) + ) + return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd index 78adb20021c..29acc21fc05 100644 --- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd @@ -15,19 +15,21 @@ from .table cimport Table cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) -cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) +cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) -cpdef size_type distinct_count( - Column source_table, - null_policy null_handling, - nan_policy nan_handling +cpdef Table unique( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, ) -cpdef Table stable_distinct( +cpdef Table distinct( Table input, list keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, ) cpdef Column distinct_indices( @@ -36,3 +38,23 @@ cpdef Column distinct_indices( null_equality nulls_equal, nan_equality nans_equal, ) + +cpdef Table stable_distinct( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, +) + +cpdef size_type unique_count( + Column column, + null_policy null_handling, + nan_policy nan_handling +) + +cpdef size_type distinct_count( + Column column, + null_policy null_handling, + nan_policy nan_handling +) diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx index 0357866980a..af7a85d31bf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx @@ -51,6 +51,34 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): return Table.from_libcudf(move(c_result)) +cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): + """Filters out rows from the input table based on the presence of NaNs. + + Parameters + ---------- + source_table : Table + The input table to filter. + keys : List[size_type] + The list of column indexes to consider for NaN filtering. + keep_threshold : size_type + The minimum number of non-NaNs required to keep a row. + + Returns + ------- + Table + A new table with rows removed based on NaNs. + """ + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold + ) + ) + return Table.from_libcudf(move(c_result)) + + cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """Filters out rows from the input table based on a boolean mask. @@ -76,39 +104,55 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): return Table.from_libcudf(move(c_result)) -cpdef size_type distinct_count( - Column source_table, - null_policy null_handling, - nan_policy nan_handling +cpdef Table unique( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, ): - """Returns the number of unique elements in the input column. + """Filter duplicate consecutive rows from the input table. Parameters ---------- - source_table : Column - The input column to count the unique elements of. - null_handling : null_policy - Flag to include or exclude nulls from the count. - nan_handling : nan_policy - Flag to include or exclude NaNs from the count. + input : Table + The input table to filter + keys : list[int] + The list of column indexes to consider for filtering. + keep : duplicate_keep_option + The option to specify which rows to keep in the case of duplicates. + nulls_equal : null_equality + The option to specify how nulls are handled in the comparison. Returns ------- - size_type - The number of unique elements in the input column. + Table + New Table with unique rows from each sequence of equivalent rows + as specified by keep. In the same order as the input table. + + Notes + ----- + If the input columns to be filtered on are sorted, then + unique can produce the same result as stable_distinct, but faster. """ - return cpp_stream_compaction.distinct_count( - source_table.view(), null_handling, nan_handling - ) + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.unique( + input.view(), c_keys, keep, nulls_equal + ) + ) + return Table.from_libcudf(move(c_result)) -cpdef Table stable_distinct( +cpdef Table distinct( Table input, list keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, ): - """Get the distinct rows from the input table, preserving input order. + """Get the distinct rows from the input table. Parameters ---------- @@ -120,18 +164,21 @@ cpdef Table stable_distinct( The option to specify which rows to keep in the case of duplicates. nulls_equal : null_equality The option to specify how nulls are handled in the comparison. + nans_equal : nan_equality + The option to specify how NaNs are handled in the comparison. Returns ------- Table - A new table with distinct rows from the input table. + A new table with distinct rows from the input table. The + output will not necessarily be in the same order as the input. """ cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: c_result = move( - cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal + cpp_stream_compaction.distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) ) return Table.from_libcudf(move(c_result)) @@ -169,3 +216,99 @@ cpdef Column distinct_indices( ) ) return Column.from_libcudf(move(c_result)) + + +cpdef Table stable_distinct( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, +): + """Get the distinct rows from the input table, preserving input order. + + Parameters + ---------- + input : Table + The input table to filter. + keys : list + The list of column indexes to consider for distinct filtering. + keep : duplicate_keep_option + The option to specify which rows to keep in the case of duplicates. + nulls_equal : null_equality + The option to specify how nulls are handled in the comparison. + nans_equal : nan_equality + The option to specify how NaNs are handled in the comparison. + + Returns + ------- + Table + A new table with distinct rows from the input table, preserving + the input table order. + """ + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.stable_distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal + ) + ) + return Table.from_libcudf(move(c_result)) + + +cpdef size_type unique_count( + Column source, + null_policy null_handling, + nan_policy nan_handling +): + """Returns the number of unique consecutive elements in the input column. + + Parameters + ---------- + source : Column + The input column to count the unique elements of. + null_handling : null_policy + Flag to include or exclude nulls from the count. + nan_handling : nan_policy + Flag to include or exclude NaNs from the count. + + Returns + ------- + size_type + The number of unique consecutive elements in the input column. + + Notes + ----- + If the input column is sorted, then unique_count can produce the + same result as distinct_count, but faster. + """ + return cpp_stream_compaction.unique_count( + source.view(), null_handling, nan_handling + ) + + +cpdef size_type distinct_count( + Column source, + null_policy null_handling, + nan_policy nan_handling +): + """Returns the number of distinct elements in the input column. + + Parameters + ---------- + source : Column + The input column to count the unique elements of. + null_handling : null_policy + Flag to include or exclude nulls from the count. + nan_handling : nan_policy + Flag to include or exclude NaNs from the count. + + Returns + ------- + size_type + The number of distinct elements in the input column. + """ + return cpp_stream_compaction.distinct_count( + source.view(), null_handling, nan_handling + ) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 04883eac559..834f91f48d9 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -109,6 +109,7 @@ def drop_duplicates(list columns, keep_option, pylibcudf.types.NullEquality.EQUAL if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL, + pylibcudf.types.NanEquality.ALL_EQUAL, ) ) From da113015aade79d78628d00578dff22a4dd5cf35 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 4 Mar 2024 13:17:33 -0600 Subject: [PATCH 2/4] Switch `pytest-xdist` algo to `worksteal` (#15207) This PR switches `pytest-xdist` distribution algorithm to a much more efficient algorithm `worksteal`, that will assign any idle pytest worker to pickup remaining pytests. I see a 25% time savings when this switch is made locally: ``` `loadscope`: == 101421 passed, 2115 skipped, 867 xfailed in 1179.48s (0:19:39) == `worksteal`: == 101423 passed, 2115 skipped, 867 xfailed in 891.79s (0:14:51) == ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/15207 --- ci/test_python_cudf.sh | 6 +++--- ci/test_python_other.sh | 4 ++-- ci/test_wheel_cudf.sh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index ace71bb0b75..bacb54b3896 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -18,7 +18,7 @@ rapids-logger "pytest cudf" ./ci/run_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-coverage.xml" \ @@ -32,7 +32,7 @@ rapids-logger "pytest cudf" rapids-logger "pytest for cudf benchmarks" ./ci/run_cudf_pytest_benchmarks.sh \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-coverage.xml" \ @@ -41,7 +41,7 @@ rapids-logger "pytest for cudf benchmarks" rapids-logger "pytest for cudf benchmarks using pandas" ./ci/run_cudf_pandas_pytest_benchmarks.sh \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-pandas-coverage.xml" \ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index bc15747b26a..9cdceb295db 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -23,7 +23,7 @@ rapids-logger "pytest dask_cudf" ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=dask_cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ @@ -33,7 +33,7 @@ rapids-logger "pytest custreamz" ./ci/run_custreamz_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=custreamz \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index b7e8f862ed5..af5779f478a 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -37,7 +37,7 @@ else --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ . popd fi From 0ff5a2c59cb62d6b3c473885ebbe883d1aae8c4f Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 4 Mar 2024 15:20:32 -0500 Subject: [PATCH 3/4] Replace local copyright check with pre-commit-hooks verify-copyright (#14917) The local `copyright.py` script is bug-prone. Replace it with a more robust centralized script from `pre-commit-hooks`. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) - Karthikeyan (https://github.com/karthikeyann) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14917 --- .pre-commit-config.yaml | 13 +- ci/checks/copyright.py | 277 ---------------------------------------- 2 files changed, 7 insertions(+), 283 deletions(-) delete mode 100644 ci/checks/copyright.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d302543368e..9235c80bdc9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -126,12 +126,6 @@ repos: - cmakelang==0.6.13 verbose: true require_serial: true - - id: copyright-check - name: copyright-check - entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year - language: python - pass_filenames: false - additional_dependencies: [gitpython] - id: doxygen-check name: doxygen-check entry: ./ci/checks/doxygen.sh @@ -161,6 +155,13 @@ repos: hooks: - id: ruff files: python/.*$ + - repo: https://github.com/rapidsai/pre-commit-hooks + rev: v0.0.1 + hooks: + - id: verify-copyright + exclude: | + (?x) + cpp/include/cudf_test/cxxopts[.]hpp$ default_language_version: diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py deleted file mode 100644 index dd89b092496..00000000000 --- a/ci/checks/copyright.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import datetime -import os -import re -import sys - -import git - -FilesToCheck = [ - re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"), - re.compile(r"CMakeLists[.]txt$"), - re.compile(r"CMakeLists_standalone[.]txt$"), - re.compile(r"setup[.]cfg$"), - re.compile(r"meta[.]yaml$"), -] -ExemptFiles = [ - re.compile(r"cpp/include/cudf_test/cxxopts.hpp"), -] - -# this will break starting at year 10000, which is probably OK :) -CheckSimple = re.compile( - r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)" -) -CheckDouble = re.compile( - r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)" # noqa: E501 -) - - -def checkThisFile(f): - if isinstance(f, git.Diff): - if f.deleted_file or f.b_blob.size == 0: - return False - f = f.b_path - elif not os.path.exists(f) or os.stat(f).st_size == 0: - # This check covers things like symlinks which point to files that DNE - return False - for exempt in ExemptFiles: - if exempt.search(f): - return False - for checker in FilesToCheck: - if checker.search(f): - return True - return False - - -def modifiedFiles(): - """Get a set of all modified files, as Diff objects. - - The files returned have been modified in git since the merge base of HEAD - and the upstream of the target branch. We return the Diff objects so that - we can read only the staged changes. - """ - repo = git.Repo() - # Use the environment variable TARGET_BRANCH or RAPIDS_BASE_BRANCH (defined in CI) if possible - target_branch = os.environ.get("TARGET_BRANCH", os.environ.get("RAPIDS_BASE_BRANCH")) - if target_branch is None: - # Fall back to the closest branch if not on CI - target_branch = repo.git.describe( - all=True, tags=True, match="branch-*", abbrev=0 - ).lstrip("heads/") - - upstream_target_branch = None - if target_branch in repo.heads: - # Use the tracking branch of the local reference if it exists. This - # returns None if no tracking branch is set. - upstream_target_branch = repo.heads[target_branch].tracking_branch() - if upstream_target_branch is None: - # Fall back to the remote with the newest target_branch. This code - # path is used on CI because the only local branch reference is - # current-pr-branch, and thus target_branch is not in repo.heads. - # This also happens if no tracking branch is defined for the local - # target_branch. We use the remote with the latest commit if - # multiple remotes are defined. - candidate_branches = [ - remote.refs[target_branch] for remote in repo.remotes - if target_branch in remote.refs - ] - if len(candidate_branches) > 0: - upstream_target_branch = sorted( - candidate_branches, - key=lambda branch: branch.commit.committed_datetime, - )[-1] - else: - # If no remotes are defined, try to use the local version of the - # target_branch. If this fails, the repo configuration must be very - # strange and we can fix this script on a case-by-case basis. - upstream_target_branch = repo.heads[target_branch] - merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0] - diff = merge_base.diff() - changed_files = {f for f in diff if f.b_path is not None} - return changed_files - - -def getCopyrightYears(line): - res = CheckSimple.search(line) - if res: - return int(res.group(1)), int(res.group(1)) - res = CheckDouble.search(line) - if res: - return int(res.group(1)), int(res.group(2)) - return None, None - - -def replaceCurrentYear(line, start, end): - # first turn a simple regex into double (if applicable). then update years - res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line) - res = CheckDouble.sub( - rf"Copyright (c) {start:04d}-{end:04d}, NVIDIA CORPORATION", - res, - ) - return res - - -def checkCopyright(f, update_current_year): - """Checks for copyright headers and their years.""" - errs = [] - thisYear = datetime.datetime.now().year - lineNum = 0 - crFound = False - yearMatched = False - - if isinstance(f, git.Diff): - path = f.b_path - lines = f.b_blob.data_stream.read().decode().splitlines(keepends=True) - else: - path = f - with open(f, encoding="utf-8") as fp: - lines = fp.readlines() - - for line in lines: - lineNum += 1 - start, end = getCopyrightYears(line) - if start is None: - continue - crFound = True - if start > end: - e = [ - path, - lineNum, - "First year after second year in the copyright " - "header (manual fix required)", - None, - ] - errs.append(e) - elif thisYear < start or thisYear > end: - e = [ - path, - lineNum, - "Current year not included in the copyright header", - None, - ] - if thisYear < start: - e[-1] = replaceCurrentYear(line, thisYear, end) - if thisYear > end: - e[-1] = replaceCurrentYear(line, start, thisYear) - errs.append(e) - else: - yearMatched = True - # copyright header itself not found - if not crFound: - e = [ - path, - 0, - "Copyright header missing or formatted incorrectly " - "(manual fix required)", - None, - ] - errs.append(e) - # even if the year matches a copyright header, make the check pass - if yearMatched: - errs = [] - - if update_current_year: - errs_update = [x for x in errs if x[-1] is not None] - if len(errs_update) > 0: - lines_changed = ", ".join(str(x[1]) for x in errs_update) - print(f"File: {path}. Changing line(s) {lines_changed}") - for _, lineNum, __, replacement in errs_update: - lines[lineNum - 1] = replacement - with open(path, "w", encoding="utf-8") as out_file: - out_file.writelines(lines) - - return errs - - -def getAllFilesUnderDir(root, pathFilter=None): - retList = [] - for dirpath, dirnames, filenames in os.walk(root): - for fn in filenames: - filePath = os.path.join(dirpath, fn) - if pathFilter(filePath): - retList.append(filePath) - return retList - - -def checkCopyright_main(): - """ - Checks for copyright headers in all the modified files. In case of local - repo, this script will just look for uncommitted files and in case of CI - it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch" - """ - retVal = 0 - - argparser = argparse.ArgumentParser( - "Checks for a consistent copyright header in git's modified files" - ) - argparser.add_argument( - "--update-current-year", - dest="update_current_year", - action="store_true", - required=False, - help="If set, " - "update the current year if a header is already " - "present and well formatted.", - ) - argparser.add_argument( - "--git-modified-only", - dest="git_modified_only", - action="store_true", - required=False, - help="If set, " - "only files seen as modified by git will be " - "processed.", - ) - - args, dirs = argparser.parse_known_args() - - if args.git_modified_only: - files = [f for f in modifiedFiles() if checkThisFile(f)] - else: - files = [] - for d in [os.path.abspath(d) for d in dirs]: - if not os.path.isdir(d): - raise ValueError(f"{d} is not a directory.") - files += getAllFilesUnderDir(d, pathFilter=checkThisFile) - - errors = [] - for f in files: - errors += checkCopyright(f, args.update_current_year) - - if len(errors) > 0: - if any(e[-1] is None for e in errors): - print("Copyright headers incomplete in some of the files!") - for e in errors: - print(" %s:%d Issue: %s" % (e[0], e[1], e[2])) - print("") - n_fixable = sum(1 for e in errors if e[-1] is not None) - path_parts = os.path.abspath(__file__).split(os.sep) - file_from_repo = os.sep.join(path_parts[path_parts.index("ci") :]) - if n_fixable > 0 and not args.update_current_year: - print( - f"You can run `python {file_from_repo} --git-modified-only " - "--update-current-year` and stage the results in git to " - f"fix {n_fixable} of these errors.\n" - ) - retVal = 1 - - return retVal - - -if __name__ == "__main__": - sys.exit(checkCopyright_main()) From d158ccdbe651952bd649cb0f17c41467c5209824 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 4 Mar 2024 15:25:51 -0500 Subject: [PATCH 4/4] API for JSON unquoted whitespace normalization (#15033) This work is a follow-up to PR #14931 which provided a proof-of-concept for using the a FST to normalize unquoted whitespaces. This PR implements the pre-processing FST in cuIO and adds a JSON reader option that needs to be set to true to invoke the normalizer. Addresses feature request #14865 Authors: - Shruti Shivakumar (https://github.com/shrshi) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Vukasin Milovanovic (https://github.com/vuule) - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15033 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/io/detail/json.hpp | 10 + cpp/include/cudf/io/json.hpp | 31 +++ ...normalization.cu => json_normalization.cu} | 142 ++++++++++++- cpp/src/io/json/read_json.cu | 7 + .../io/json_whitespace_normalization_test.cu | 201 ++++-------------- .../main/java/ai/rapids/cudf/JSONOptions.java | 15 ++ java/src/main/java/ai/rapids/cudf/Table.java | 9 + java/src/main/native/src/TableJni.cpp | 27 ++- .../test/java/ai/rapids/cudf/TableTest.java | 49 +++-- java/src/test/resources/whitespaces.json | 5 + 11 files changed, 314 insertions(+), 184 deletions(-) rename cpp/src/io/json/{json_quote_normalization.cu => json_normalization.cu} (57%) create mode 100644 java/src/test/resources/whitespaces.json diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5fd6cd3544a..c74963be50d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -376,7 +376,7 @@ add_library( src/io/functions.cpp src/io/json/byte_range_info.cu src/io/json/json_column.cu - src/io/json/json_quote_normalization.cu + src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/read_json.cu diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 0eb0e17ea10..3f7f7e9bb32 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -63,4 +63,14 @@ rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& in rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Normalize unquoted whitespace (space and tab characters) using FST + * + * @param inbuf Input device buffer + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +rmm::device_uvector normalize_whitespace(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index f0c3d48ab7e..593dd044d51 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -118,6 +118,9 @@ class json_reader_options { // Normalize single quotes bool _normalize_single_quotes = false; + // Normalize unquoted spaces and tabs + bool _normalize_whitespace = false; + // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; @@ -265,6 +268,13 @@ class json_reader_options { */ bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; } + /** + * @brief Whether the reader should normalize unquoted whitespace characters + * + * @returns true if the reader should normalize whitespace, false otherwise + */ + bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; } + /** * @brief Queries the JSON reader's behavior on invalid JSON lines. * @@ -358,6 +368,14 @@ class json_reader_options { */ void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; } + /** + * @brief Set whether the reader should enable normalization of unquoted whitespace + * + * @param val Boolean value to indicate whether the reader should normalize unquoted whitespace + * characters i.e. tabs and spaces + */ + void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * @@ -533,6 +551,19 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the reader should normalize unquoted whitespace + * + * @param val Boolean value to indicate whether the reader should normalize unquoted + * whitespace + * @return this for chaining + */ + json_reader_options_builder& normalize_whitespace(bool val) + { + options._normalize_whitespace = val; + return *this; + } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_normalization.cu similarity index 57% rename from cpp/src/io/json/json_quote_normalization.cu rename to cpp/src/io/json/json_normalization.cu index a13b6e0b016..86e4da664a8 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -32,13 +32,15 @@ namespace cudf::io::json { -using SymbolT = char; -using StateT = char; +// Type used to represent the atomic symbol type used within the finite-state machine +using SymbolT = char; +using StateT = char; + +// Type sufficiently large to index symbols within the input and output (may be unsigned) using SymbolOffsetT = uint32_t; namespace normalize_quotes { -// Type sufficiently large to index symbols within the input and output (may be unsigned) enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " @@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes { } // namespace normalize_quotes +namespace normalize_whitespace { + +enum class dfa_symbol_group_id : uint32_t { + DOUBLE_QUOTE_CHAR, ///< Quote character SG: " + ESCAPE_CHAR, ///< Escape character SG: '\\' + NEWLINE_CHAR, ///< Newline character SG: '\n' + WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' + OTHER_SYMBOLS, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; +// Alias for readability of symbol group ids +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); +// The i-th string representing all the characters of a symbol group +std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ + {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; + +/** + * -------- FST states --------- + * ----------------------------- + * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double + * | quotes as well as any other character not enclosed by a string. Also handles + * | newline character present within a string + * TT_DQS | Double-quoted string state handling all characters within double quotes except + * | newline character + * TT_DEC | State handling escaped characters inside double-quoted string. Note that this + * | state is necessary to process escaped double-quote characters. Without this + * | state, whitespaces following escaped double quotes inside strings may be removed. + * + * NOTE: An important case NOT handled by this FST is that of whitespace following newline + * characters within a string. Consider the following example + * Input: {"a":"x\n y"} + * FST output: {"a":"x\ny"} + * Expected output: {"a":"x\n y"} + * Such strings are not part of the JSON standard (characters allowed within quotes should + * have ASCII at least 0x20 i.e. space character and above) but may be encountered while + * reading JSON files + */ +enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; +// Aliases for readability of the transition table +constexpr auto TT_OOS = dfa_states::TT_OOS; +constexpr auto TT_DQS = dfa_states::TT_DQS; +constexpr auto TT_DEC = dfa_states::TT_DEC; +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); + +// Transition table +std::array, TT_NUM_STATES> const wna_state_tt{ + {/* IN_STATE " \ \n OTHER */ + /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; + +// The DFA's starting state +constexpr StateT start_state = static_cast(TT_OOS); + +struct TransduceToNormalizedWS { + /** + * @brief Returns the -th output symbol on the transition (state_id, match_id). + */ + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + // -------- TRANSLATION TABLE ------------ + // Let the alphabet set be Sigma + // --------------------------------------- + // ---------- NON-SPECIAL CASES: ---------- + // Output symbol same as input symbol + // state | read_symbol -> output_symbol + // DQS | Sigma -> Sigma + // OOS | Sigma\{,\t} -> Sigma\{,\t} + // DEC | Sigma -> Sigma + // ---------- SPECIAL CASES: -------------- + // Input symbol translates to output symbol + // OOS | {} -> + // OOS | {\t} -> + + // Case when read symbol is a space or tab but is unquoted + // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function + // However, since there is no output in this case i.e. the count returned by + // operator()(state_id, match_id, read_symbol) is zero, this function is never called. + // So skipping the check for this case. + + // In all other cases, we have an output symbol for the input symbol. + // We simply output the input symbol + return read_symbol; + } + + /** + * @brief Returns the number of output characters for a given transition. + * During whitespace normalization, we always emit one output character i.e., the input + * character, except when we need to remove the space/tab character + */ + template + constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + // Case when read symbol is a space or tab but is unquoted + if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && + state_id == static_cast(dfa_states::TT_OOS)) { + return 0; + } + return 1; + } +}; + +} // namespace normalize_whitespace + namespace detail { rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, @@ -198,5 +310,29 @@ rmm::device_uvector normalize_single_quotes(rmm::device_uvector normalize_whitespace(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), + fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), + fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}), + stream); + + rmm::device_uvector outbuf(inbuf.size(), stream, mr); + rmm::device_scalar outbuf_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), + outbuf.data(), + thrust::make_discard_iterator(), + outbuf_size.data(), + normalize_whitespace::start_state, + stream); + + outbuf.resize(outbuf_size.value(stream), stream); + return outbuf; +} + } // namespace detail } // namespace cudf::io::json diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index ba8acf2d47a..506d7b6cddc 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -235,6 +235,13 @@ table_with_metadata read_json(host_span> sources, normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource()); } + // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is + // enabled, invoke pre-processing FST + if (reader_opts.is_enabled_normalize_whitespace()) { + buffer = + normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource()); + } + return device_parse_nested_json(buffer, reader_opts, stream, mr); // For debug purposes, use host_parse_nested_json() } diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json_whitespace_normalization_test.cu index 545d8d2c4f9..336d360063f 100644 --- a/cpp/tests/io/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json_whitespace_normalization_test.cu @@ -13,177 +13,41 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "io/fst/lookup_tables.cuh" -#include "io/utilities/hostdevice_vector.hpp" - #include #include -#include +#include #include -#include +#include +#include +#include #include +#include -#include #include +#include -#include - -#include #include -namespace { -// Type used to represent the atomic symbol type used within the finite-state machine -using SymbolT = char; -using StateT = char; - -// Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; - -enum class dfa_symbol_group_id : uint32_t { - DOUBLE_QUOTE_CHAR, ///< Quote character SG: " - ESCAPE_CHAR, ///< Escape character SG: '\\' - NEWLINE_CHAR, ///< Newline character SG: '\n' - WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' - OTHER_SYMBOLS, ///< SG implicitly matching all other characters - NUM_SYMBOL_GROUPS ///< Total number of symbol groups -}; -// Alias for readability of symbol group ids -constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ - {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; - -/** - * -------- FST states --------- - * ----------------------------- - * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double - * | quotes as well as any other character not enclosed by a string. Also handles - * | newline character present within a string - * TT_DQS | Double-quoted string state handling all characters within double quotes except - * | newline character - * TT_DEC | State handling escaped characters inside double-quoted string. Note that this - * | state is necessary to process escaped double-quote characters. Without this - * | state, whitespaces following escaped double quotes inside strings may be removed. - * - * NOTE: An important case NOT handled by this FST is that of whitespace following newline - * characters within a string. Consider the following example - * Input: {"a":"x\n y"} - * FST output: {"a":"x\ny"} - * Expected output: {"a":"x\n y"} - */ -enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; -// Aliases for readability of the transition table -constexpr auto TT_OOS = dfa_states::TT_OOS; -constexpr auto TT_DQS = dfa_states::TT_DQS; -constexpr auto TT_DEC = dfa_states::TT_DEC; -constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); - -// Transition table -std::array, TT_NUM_STATES> const wna_state_tt{ - {/* IN_STATE " \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; - -// The DFA's starting state -constexpr StateT start_state = static_cast(TT_OOS); - -struct TransduceToNormalizedWS { - /** - * @brief Returns the -th output symbol on the transition (state_id, match_id). - */ - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, - SymbolGroupT const match_id, - RelativeOffsetT const relative_offset, - SymbolT const read_symbol) const - { - // -------- TRANSLATION TABLE ------------ - // Let the alphabet set be Sigma - // --------------------------------------- - // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol - // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // OOS | Sigma\{,\t} -> Sigma\{,\t} - // DEC | Sigma -> Sigma - // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {} -> - // OOS | {\t} -> - - // Case when read symbol is a space or tab but is unquoted - // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function - // However, since there is no output in this case i.e. the count returned by - // operator()(state_id, match_id, read_symbol) is zero, this function is never called. - // So skipping the check for this case. - - // In all other cases, we have an output symbol for the input symbol. - // We simply output the input symbol - return read_symbol; - } - - /** - * @brief Returns the number of output characters for a given transition. - * During whitespace normalization, we always emit one output character i.e., the input - * character, except when we need to remove the space/tab character - */ - template - constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id, - SymbolGroupT const match_id, - SymbolT const read_symbol) const - { - // Case when read symbol is a space or tab but is unquoted - if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && - state_id == static_cast(dfa_states::TT_OOS)) { - return 0; - } - return 1; - } -}; -} // namespace - // Base test fixture for tests struct JsonWSNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& input, std::string const& output) +void run_test(std::string const& host_input, std::string const& expected_host_output) { - auto parser = cudf::io::fst::detail::make_fst( - cudf::io::fst::detail::make_symbol_group_lut(wna_sgs), - cudf::io::fst::detail::make_transition_table(wna_state_tt), - cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedWS{}), - cudf::test::get_default_stream()); - - auto d_input_scalar = cudf::make_string_scalar(input, cudf::test::get_default_stream()); - auto& d_input = static_cast&>(*d_input_scalar); + auto stream_view = cudf::get_default_stream(); + auto device_input = cudf::detail::make_device_uvector_async( + host_input, stream_view, rmm::mr::get_current_device_resource()); - // Prepare input & output buffers - constexpr std::size_t single_item = 1; - cudf::detail::hostdevice_vector output_gpu(input.size(), - cudf::test::get_default_stream()); - cudf::detail::hostdevice_vector output_gpu_size(single_item, - cudf::test::get_default_stream()); + // Preprocessing FST + auto device_fst_output = cudf::io::json::detail::normalize_whitespace( + std::move(device_input), stream_view, rmm::mr::get_current_device_resource()); - // Allocate device-side temporary storage & run algorithm - parser.Transduce(d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), - thrust::make_discard_iterator(), - output_gpu_size.device_ptr(), - start_state, - cudf::test::get_default_stream()); + auto const preprocessed_host_output = + cudf::detail::make_std_vector_sync(device_fst_output, stream_view); - // Async copy results from device to host - output_gpu.device_to_host_async(cudf::test::get_default_stream()); - output_gpu_size.device_to_host_async(cudf::test::get_default_stream()); - - // Make sure results have been copied back to host - cudf::test::get_default_stream().synchronize(); - - // Verify results - ASSERT_EQ(output_gpu_size[0], output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size()); + ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL( + preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); } TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces) @@ -259,4 +123,33 @@ TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput) run_test(input, output); } +TEST_F(JsonWSNormalizationTest, ReadJsonOption) +{ + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace + + // Test input + std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true); + + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); + + // Expected table + std::string const expected_input = R"({ "a" : {"b":"c"}})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false); + + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 62496e32f7a..b37d0d88ec9 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -31,6 +31,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean lines; private final boolean recoverWithNull; private final boolean normalizeSingleQuotes; + private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; @@ -40,6 +41,7 @@ private JSONOptions(Builder builder) { lines = builder.lines; recoverWithNull = builder.recoverWithNull; normalizeSingleQuotes = builder.normalizeSingleQuotes; + normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; } @@ -61,6 +63,10 @@ public boolean isNormalizeSingleQuotes() { return normalizeSingleQuotes; } + public boolean isNormalizeWhitespace() { + return normalizeWhitespace; + } + public boolean isMixedTypesAsStrings() { return mixedTypesAsStrings; } @@ -84,6 +90,7 @@ public static final class Builder extends ColumnFilterOptions.Builder(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .keep_quotes(keep_quotes) - .mixed_types_as_string(mixed_types_as_string); + .normalize_whitespace(static_cast(normalize_whitespace)) + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1461,8 +1462,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string, - jboolean keep_quotes) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean normalize_whitespace, + jboolean mixed_types_as_string, jboolean keep_quotes) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1484,8 +1485,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .keep_quotes(keep_quotes) - .mixed_types_as_string(mixed_types_as_string); + .normalize_whitespace(static_cast(normalize_whitespace)) + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1573,8 +1575,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes, - jlong ds_handle) { + jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, + jboolean keep_quotes, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1606,6 +1608,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); @@ -1646,7 +1649,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) { + jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, + jboolean keep_quotes) { bool read_buffer = true; if (buffer == 0) { @@ -1693,6 +1697,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index bee8d1cbb88..3f0470d854a 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -88,6 +88,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json"); + private static final File TEST_JSON_WHITESPACES_FILE = TestUtils.getResourceAsFile("whitespaces.json"); private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json"); private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json"); @@ -349,6 +350,39 @@ void testReadSingleQuotesJSONFile() throws IOException { } @Test + void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(false) + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, source)); + } + } + + @Test + void testReadWhitespacesJSONFile() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("b", "50", "[1,2,3,4,5,6,7,8]", "{\"c\":\"d\"}", "b") + .build(); + MultiBufferDataSource source = sourceFrom(TEST_JSON_WHITESPACES_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -547,21 +581,6 @@ void testReadMixedType2JSONFile() throws IOException { } } - @Test - void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { - Schema schema = Schema.builder() - .column(DType.STRING, "A") - .build(); - JSONOptions opts = JSONOptions.builder() - .withLines(true) - .withNormalizeSingleQuotes(false) - .build(); - try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { - assertThrows(CudfException.class, () -> - Table.readJSON(schema, opts, source)); - } - } - @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() diff --git a/java/src/test/resources/whitespaces.json b/java/src/test/resources/whitespaces.json new file mode 100644 index 00000000000..f5ddd8cde5f --- /dev/null +++ b/java/src/test/resources/whitespaces.json @@ -0,0 +1,5 @@ +{"a":"b"} + { "a" : "50" } +{"a": [1, 2, 3, 4, 5, 6, 7, 8]} +{"a": {"c": "d"}} +{"a": "b"}