From dbdcc31fe1cbe902d495428da3c68dc59d289dc5 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 4 Mar 2024 18:22:49 +0000
Subject: [PATCH 1/4] Expose new stable_sort and finish stream_compaction in
 pylibcudf (#15175)

Completes coverage of `sorting.hpp` and `stream_compaction.hpp`

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15175
---
 python/cudf/cudf/_lib/cpp/sorting.pxd         |   7 +-
 .../cudf/cudf/_lib/cpp/stream_compaction.pxd  |  43 +++-
 python/cudf/cudf/_lib/pylibcudf/sorting.pxd   |   2 +
 python/cudf/cudf/_lib/pylibcudf/sorting.pyx   |  39 +++-
 .../cudf/_lib/pylibcudf/stream_compaction.pxd |  34 +++-
 .../cudf/_lib/pylibcudf/stream_compaction.pyx | 185 ++++++++++++++++--
 python/cudf/cudf/_lib/stream_compaction.pyx   |   1 +
 7 files changed, 275 insertions(+), 36 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd
index 68f01003fe6..86dc0f0de95 100644
--- a/python/cudf/cudf/_lib/cpp/sorting.pxd
+++ b/python/cudf/cudf/_lib/cpp/sorting.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -68,3 +68,8 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil:
         table_view source_table,
         vector[libcudf_types.order] column_order,
         vector[libcudf_types.null_order] null_precedence) except +
+
+    cdef unique_ptr[table] stable_sort(
+        table_view source_table,
+        vector[libcudf_types.order] column_order,
+        vector[libcudf_types.null_order] null_precedence) except +
diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
index e8539ecb9c3..55854a9444f 100644
--- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
@@ -30,21 +30,28 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil:
                                       vector[size_type] keys,
                                       size_type keep_threshold) except +
 
+    cdef unique_ptr[table] drop_nans(table_view source_table,
+                                     vector[size_type] keys,
+                                     size_type keep_threshold) except +
+
     cdef unique_ptr[table] apply_boolean_mask(
         table_view source_table,
         column_view boolean_mask
     ) except +
 
-    cdef size_type distinct_count(
-        column_view source_table,
-        null_policy null_handling,
-        nan_policy nan_handling) except +
+    cdef unique_ptr[table] unique(
+        table_view input,
+        vector[size_type] keys,
+        duplicate_keep_option keep,
+        null_equality nulls_equal,
+    ) except +
 
-    cdef unique_ptr[table] stable_distinct(
+    cdef unique_ptr[table] distinct(
         table_view input,
         vector[size_type] keys,
         duplicate_keep_option keep,
         null_equality nulls_equal,
+        nan_equality nans_equals,
     ) except +
 
     cdef unique_ptr[column] distinct_indices(
@@ -53,3 +60,29 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil:
         null_equality nulls_equal,
         nan_equality nans_equal,
     ) except +
+
+    cdef unique_ptr[table] stable_distinct(
+        table_view input,
+        vector[size_type] keys,
+        duplicate_keep_option keep,
+        null_equality nulls_equal,
+        nan_equality nans_equal,
+    ) except +
+
+    cdef size_type unique_count(
+        column_view column,
+        null_policy null_handling,
+        nan_policy nan_handling) except +
+
+    cdef size_type unique_count(
+        table_view source_table,
+        null_policy null_handling) except +
+
+    cdef size_type distinct_count(
+        column_view column,
+        null_policy null_handling,
+        nan_policy nan_handling) except +
+
+    cdef size_type distinct_count(
+        table_view source_table,
+        null_policy null_handling) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
index fb22da0b0fd..3ed241622c0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
@@ -59,3 +59,5 @@ cpdef Table stable_sort_by_key(
 )
 
 cpdef Table sort(Table source_table, list column_order, list null_precedence)
+
+cpdef Table stable_sort(Table source_table, list column_order, list null_precedence)
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
index 4e73760720a..1668a3efc7c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
@@ -50,7 +50,8 @@ cpdef Column stable_sorted_order(
     list column_order,
     list null_precedence,
 ):
-    """Computes the row indices required to sort the table, maintaining input order.
+    """Computes the row indices required to sort the table,
+    preserving order of equal elements.
 
     Parameters
     ----------
@@ -206,7 +207,8 @@ cpdef Table stable_segmented_sort_by_key(
     list column_order,
     list null_precedence,
 ):
-    """Sorts the table by key, within segments, maintaining input order.
+    """Sorts the table by key preserving order of equal elements,
+    within segments.
 
     Parameters
     ----------
@@ -287,7 +289,7 @@ cpdef Table stable_sort_by_key(
     list column_order,
     list null_precedence,
 ):
-    """Sorts the table by key, maintaining input order.
+    """Sorts the table by key preserving order of equal elements.
 
     Parameters
     ----------
@@ -349,3 +351,34 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence):
             )
         )
     return Table.from_libcudf(move(c_result))
+
+
+cpdef Table stable_sort(Table source_table, list column_order, list null_precedence):
+    """Sorts the table preserving order of equal elements.
+
+    Parameters
+    ----------
+    source_table : Table
+        The table to sort.
+    column_order : List[ColumnOrder]
+        Whether each column should be sorted in ascending or descending order.
+    null_precedence : List[NullOrder]
+        Whether nulls should come before or after non-nulls.
+
+    Returns
+    -------
+    Table
+        The sorted table.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[order] c_orders = column_order
+    cdef vector[null_order] c_null_precedence = null_precedence
+    with nogil:
+        c_result = move(
+            cpp_sorting.stable_sort(
+                source_table.view(),
+                c_orders,
+                c_null_precedence,
+            )
+        )
+    return Table.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
index 78adb20021c..29acc21fc05 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
@@ -15,19 +15,21 @@ from .table cimport Table
 
 cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold)
 
-cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask)
+cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold)
 
-cpdef size_type distinct_count(
-    Column source_table,
-    null_policy null_handling,
-    nan_policy nan_handling
+cpdef Table unique(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
 )
 
-cpdef Table stable_distinct(
+cpdef Table distinct(
     Table input,
     list keys,
     duplicate_keep_option keep,
     null_equality nulls_equal,
+    nan_equality nans_equal,
 )
 
 cpdef Column distinct_indices(
@@ -36,3 +38,23 @@ cpdef Column distinct_indices(
     null_equality nulls_equal,
     nan_equality nans_equal,
 )
+
+cpdef Table stable_distinct(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
+    nan_equality nans_equal,
+)
+
+cpdef size_type unique_count(
+    Column column,
+    null_policy null_handling,
+    nan_policy nan_handling
+)
+
+cpdef size_type distinct_count(
+    Column column,
+    null_policy null_handling,
+    nan_policy nan_handling
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
index 0357866980a..af7a85d31bf 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
@@ -51,6 +51,34 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold):
     return Table.from_libcudf(move(c_result))
 
 
+cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold):
+    """Filters out rows from the input table based on the presence of NaNs.
+
+    Parameters
+    ----------
+    source_table : Table
+        The input table to filter.
+    keys : List[size_type]
+        The list of column indexes to consider for NaN filtering.
+    keep_threshold : size_type
+        The minimum number of non-NaNs required to keep a row.
+
+    Returns
+    -------
+    Table
+        A new table with rows removed based on NaNs.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.drop_nulls(
+                source_table.view(), c_keys, keep_threshold
+            )
+        )
+    return Table.from_libcudf(move(c_result))
+
+
 cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask):
     """Filters out rows from the input table based on a boolean mask.
 
@@ -76,39 +104,55 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask):
     return Table.from_libcudf(move(c_result))
 
 
-cpdef size_type distinct_count(
-    Column source_table,
-    null_policy null_handling,
-    nan_policy nan_handling
+cpdef Table unique(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
 ):
-    """Returns the number of unique elements in the input column.
+    """Filter duplicate consecutive rows from the input table.
 
     Parameters
     ----------
-    source_table : Column
-        The input column to count the unique elements of.
-    null_handling : null_policy
-        Flag to include or exclude nulls from the count.
-    nan_handling : nan_policy
-        Flag to include or exclude NaNs from the count.
+    input : Table
+        The input table to filter
+    keys : list[int]
+        The list of column indexes to consider for filtering.
+    keep : duplicate_keep_option
+        The option to specify which rows to keep in the case of duplicates.
+    nulls_equal : null_equality
+        The option to specify how nulls are handled in the comparison.
 
     Returns
     -------
-    size_type
-        The number of unique elements in the input column.
+    Table
+        New Table with unique rows from each sequence of equivalent rows
+        as specified by keep. In the same order as the input table.
+
+    Notes
+    -----
+    If the input columns to be filtered on are sorted, then
+    unique can produce the same result as stable_distinct, but faster.
     """
-    return cpp_stream_compaction.distinct_count(
-        source_table.view(), null_handling, nan_handling
-    )
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.unique(
+                input.view(), c_keys, keep, nulls_equal
+            )
+        )
+    return Table.from_libcudf(move(c_result))
 
 
-cpdef Table stable_distinct(
+cpdef Table distinct(
     Table input,
     list keys,
     duplicate_keep_option keep,
     null_equality nulls_equal,
+    nan_equality nans_equal,
 ):
-    """Get the distinct rows from the input table, preserving input order.
+    """Get the distinct rows from the input table.
 
     Parameters
     ----------
@@ -120,18 +164,21 @@ cpdef Table stable_distinct(
         The option to specify which rows to keep in the case of duplicates.
     nulls_equal : null_equality
         The option to specify how nulls are handled in the comparison.
+    nans_equal : nan_equality
+        The option to specify how NaNs are handled in the comparison.
 
     Returns
     -------
     Table
-        A new table with distinct rows from the input table.
+        A new table with distinct rows from the input table. The
+        output will not necessarily be in the same order as the input.
     """
     cdef unique_ptr[table] c_result
     cdef vector[size_type] c_keys = keys
     with nogil:
         c_result = move(
-            cpp_stream_compaction.stable_distinct(
-                input.view(), c_keys, keep, nulls_equal
+            cpp_stream_compaction.distinct(
+                input.view(), c_keys, keep, nulls_equal, nans_equal
             )
         )
     return Table.from_libcudf(move(c_result))
@@ -169,3 +216,99 @@ cpdef Column distinct_indices(
             )
         )
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Table stable_distinct(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
+    nan_equality nans_equal,
+):
+    """Get the distinct rows from the input table, preserving input order.
+
+    Parameters
+    ----------
+    input : Table
+        The input table to filter.
+    keys : list
+        The list of column indexes to consider for distinct filtering.
+    keep : duplicate_keep_option
+        The option to specify which rows to keep in the case of duplicates.
+    nulls_equal : null_equality
+        The option to specify how nulls are handled in the comparison.
+    nans_equal : nan_equality
+        The option to specify how NaNs are handled in the comparison.
+
+    Returns
+    -------
+    Table
+        A new table with distinct rows from the input table, preserving
+        the input table order.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.stable_distinct(
+                input.view(), c_keys, keep, nulls_equal, nans_equal
+            )
+        )
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef size_type unique_count(
+    Column source,
+    null_policy null_handling,
+    nan_policy nan_handling
+):
+    """Returns the number of unique consecutive elements in the input column.
+
+    Parameters
+    ----------
+    source : Column
+        The input column to count the unique elements of.
+    null_handling : null_policy
+        Flag to include or exclude nulls from the count.
+    nan_handling : nan_policy
+        Flag to include or exclude NaNs from the count.
+
+    Returns
+    -------
+    size_type
+        The number of unique consecutive elements in the input column.
+
+    Notes
+    -----
+    If the input column is sorted, then unique_count can produce the
+    same result as distinct_count, but faster.
+    """
+    return cpp_stream_compaction.unique_count(
+        source.view(), null_handling, nan_handling
+    )
+
+
+cpdef size_type distinct_count(
+    Column source,
+    null_policy null_handling,
+    nan_policy nan_handling
+):
+    """Returns the number of distinct elements in the input column.
+
+    Parameters
+    ----------
+    source : Column
+        The input column to count the unique elements of.
+    null_handling : null_policy
+        Flag to include or exclude nulls from the count.
+    nan_handling : nan_policy
+        Flag to include or exclude NaNs from the count.
+
+    Returns
+    -------
+    size_type
+        The number of distinct elements in the input column.
+    """
+    return cpp_stream_compaction.distinct_count(
+        source.view(), null_handling, nan_handling
+    )
diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
index 04883eac559..834f91f48d9 100644
--- a/python/cudf/cudf/_lib/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/stream_compaction.pyx
@@ -109,6 +109,7 @@ def drop_duplicates(list columns,
             keep_option,
             pylibcudf.types.NullEquality.EQUAL
             if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL,
+            pylibcudf.types.NanEquality.ALL_EQUAL,
         )
     )
 

From da113015aade79d78628d00578dff22a4dd5cf35 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 4 Mar 2024 13:17:33 -0600
Subject: [PATCH 2/4] Switch `pytest-xdist` algo to `worksteal` (#15207)

This PR switches `pytest-xdist` distribution algorithm to a much more efficient algorithm `worksteal`, that will assign any idle pytest worker to pickup remaining pytests.

I see a 25% time savings when this switch is made locally:
```
`loadscope`:
== 101421 passed, 2115 skipped, 867 xfailed in 1179.48s (0:19:39) ==
`worksteal`:
== 101423 passed, 2115 skipped, 867 xfailed in 891.79s (0:14:51) ==
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15207
---
 ci/test_python_cudf.sh  | 6 +++---
 ci/test_python_other.sh | 4 ++--
 ci/test_wheel_cudf.sh   | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh
index ace71bb0b75..bacb54b3896 100755
--- a/ci/test_python_cudf.sh
+++ b/ci/test_python_cudf.sh
@@ -18,7 +18,7 @@ rapids-logger "pytest cudf"
 ./ci/run_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=../.coveragerc \
   --cov=cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-coverage.xml" \
@@ -32,7 +32,7 @@ rapids-logger "pytest cudf"
 rapids-logger "pytest for cudf benchmarks"
 ./ci/run_cudf_pytest_benchmarks.sh \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=.coveragerc \
   --cov=cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-coverage.xml" \
@@ -41,7 +41,7 @@ rapids-logger "pytest for cudf benchmarks"
 rapids-logger "pytest for cudf benchmarks using pandas"
 ./ci/run_cudf_pandas_pytest_benchmarks.sh \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=.coveragerc \
   --cov=cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-pandas-coverage.xml" \
diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
index bc15747b26a..9cdceb295db 100755
--- a/ci/test_python_other.sh
+++ b/ci/test_python_other.sh
@@ -23,7 +23,7 @@ rapids-logger "pytest dask_cudf"
 ./ci/run_dask_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=../.coveragerc \
   --cov=dask_cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
@@ -33,7 +33,7 @@ rapids-logger "pytest custreamz"
 ./ci/run_custreamz_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=../.coveragerc \
   --cov=custreamz \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index b7e8f862ed5..af5779f478a 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -37,7 +37,7 @@ else
       --cache-clear \
       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
       --numprocesses=8 \
-      --dist=loadscope \
+      --dist=worksteal \
       .
     popd
 fi

From 0ff5a2c59cb62d6b3c473885ebbe883d1aae8c4f Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 4 Mar 2024 15:20:32 -0500
Subject: [PATCH 3/4] Replace local copyright check with pre-commit-hooks
 verify-copyright (#14917)

The local `copyright.py` script is bug-prone. Replace it with a more robust centralized script from `pre-commit-hooks`.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Jake Awe (https://github.com/AyodeAwe)
  - Karthikeyan (https://github.com/karthikeyann)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14917
---
 .pre-commit-config.yaml |  13 +-
 ci/checks/copyright.py  | 277 ----------------------------------------
 2 files changed, 7 insertions(+), 283 deletions(-)
 delete mode 100644 ci/checks/copyright.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d302543368e..9235c80bdc9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -126,12 +126,6 @@ repos:
           - cmakelang==0.6.13
         verbose: true
         require_serial: true
-      - id: copyright-check
-        name: copyright-check
-        entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year
-        language: python
-        pass_filenames: false
-        additional_dependencies: [gitpython]
       - id: doxygen-check
         name: doxygen-check
         entry: ./ci/checks/doxygen.sh
@@ -161,6 +155,13 @@ repos:
     hooks:
       - id: ruff
         files: python/.*$
+  - repo: https://github.com/rapidsai/pre-commit-hooks
+    rev: v0.0.1
+    hooks:
+      - id: verify-copyright
+        exclude: |
+          (?x)
+              cpp/include/cudf_test/cxxopts[.]hpp$
 
 
 default_language_version:
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
deleted file mode 100644
index dd89b092496..00000000000
--- a/ci/checks/copyright.py
+++ /dev/null
@@ -1,277 +0,0 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import datetime
-import os
-import re
-import sys
-
-import git
-
-FilesToCheck = [
-    re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"),
-    re.compile(r"CMakeLists[.]txt$"),
-    re.compile(r"CMakeLists_standalone[.]txt$"),
-    re.compile(r"setup[.]cfg$"),
-    re.compile(r"meta[.]yaml$"),
-]
-ExemptFiles = [
-    re.compile(r"cpp/include/cudf_test/cxxopts.hpp"),
-]
-
-# this will break starting at year 10000, which is probably OK :)
-CheckSimple = re.compile(
-    r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"
-)
-CheckDouble = re.compile(
-    r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"  # noqa: E501
-)
-
-
-def checkThisFile(f):
-    if isinstance(f, git.Diff):
-        if f.deleted_file or f.b_blob.size == 0:
-            return False
-        f = f.b_path
-    elif not os.path.exists(f) or os.stat(f).st_size == 0:
-        # This check covers things like symlinks which point to files that DNE
-        return False
-    for exempt in ExemptFiles:
-        if exempt.search(f):
-            return False
-    for checker in FilesToCheck:
-        if checker.search(f):
-            return True
-    return False
-
-
-def modifiedFiles():
-    """Get a set of all modified files, as Diff objects.
-
-    The files returned have been modified in git since the merge base of HEAD
-    and the upstream of the target branch. We return the Diff objects so that
-    we can read only the staged changes.
-    """
-    repo = git.Repo()
-    # Use the environment variable TARGET_BRANCH or RAPIDS_BASE_BRANCH (defined in CI) if possible
-    target_branch = os.environ.get("TARGET_BRANCH", os.environ.get("RAPIDS_BASE_BRANCH"))
-    if target_branch is None:
-        # Fall back to the closest branch if not on CI
-        target_branch = repo.git.describe(
-            all=True, tags=True, match="branch-*", abbrev=0
-        ).lstrip("heads/")
-
-    upstream_target_branch = None
-    if target_branch in repo.heads:
-        # Use the tracking branch of the local reference if it exists. This
-        # returns None if no tracking branch is set.
-        upstream_target_branch = repo.heads[target_branch].tracking_branch()
-    if upstream_target_branch is None:
-        # Fall back to the remote with the newest target_branch. This code
-        # path is used on CI because the only local branch reference is
-        # current-pr-branch, and thus target_branch is not in repo.heads.
-        # This also happens if no tracking branch is defined for the local
-        # target_branch. We use the remote with the latest commit if
-        # multiple remotes are defined.
-        candidate_branches = [
-            remote.refs[target_branch] for remote in repo.remotes
-            if target_branch in remote.refs
-        ]
-        if len(candidate_branches) > 0:
-            upstream_target_branch = sorted(
-                candidate_branches,
-                key=lambda branch: branch.commit.committed_datetime,
-            )[-1]
-        else:
-            # If no remotes are defined, try to use the local version of the
-            # target_branch. If this fails, the repo configuration must be very
-            # strange and we can fix this script on a case-by-case basis.
-            upstream_target_branch = repo.heads[target_branch]
-    merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0]
-    diff = merge_base.diff()
-    changed_files = {f for f in diff if f.b_path is not None}
-    return changed_files
-
-
-def getCopyrightYears(line):
-    res = CheckSimple.search(line)
-    if res:
-        return int(res.group(1)), int(res.group(1))
-    res = CheckDouble.search(line)
-    if res:
-        return int(res.group(1)), int(res.group(2))
-    return None, None
-
-
-def replaceCurrentYear(line, start, end):
-    # first turn a simple regex into double (if applicable). then update years
-    res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
-    res = CheckDouble.sub(
-        rf"Copyright (c) {start:04d}-{end:04d}, NVIDIA CORPORATION",
-        res,
-    )
-    return res
-
-
-def checkCopyright(f, update_current_year):
-    """Checks for copyright headers and their years."""
-    errs = []
-    thisYear = datetime.datetime.now().year
-    lineNum = 0
-    crFound = False
-    yearMatched = False
-
-    if isinstance(f, git.Diff):
-        path = f.b_path
-        lines = f.b_blob.data_stream.read().decode().splitlines(keepends=True)
-    else:
-        path = f
-        with open(f, encoding="utf-8") as fp:
-            lines = fp.readlines()
-
-    for line in lines:
-        lineNum += 1
-        start, end = getCopyrightYears(line)
-        if start is None:
-            continue
-        crFound = True
-        if start > end:
-            e = [
-                path,
-                lineNum,
-                "First year after second year in the copyright "
-                "header (manual fix required)",
-                None,
-            ]
-            errs.append(e)
-        elif thisYear < start or thisYear > end:
-            e = [
-                path,
-                lineNum,
-                "Current year not included in the copyright header",
-                None,
-            ]
-            if thisYear < start:
-                e[-1] = replaceCurrentYear(line, thisYear, end)
-            if thisYear > end:
-                e[-1] = replaceCurrentYear(line, start, thisYear)
-            errs.append(e)
-        else:
-            yearMatched = True
-    # copyright header itself not found
-    if not crFound:
-        e = [
-            path,
-            0,
-            "Copyright header missing or formatted incorrectly "
-            "(manual fix required)",
-            None,
-        ]
-        errs.append(e)
-    # even if the year matches a copyright header, make the check pass
-    if yearMatched:
-        errs = []
-
-    if update_current_year:
-        errs_update = [x for x in errs if x[-1] is not None]
-        if len(errs_update) > 0:
-            lines_changed = ", ".join(str(x[1]) for x in errs_update)
-            print(f"File: {path}. Changing line(s) {lines_changed}")
-            for _, lineNum, __, replacement in errs_update:
-                lines[lineNum - 1] = replacement
-            with open(path, "w", encoding="utf-8") as out_file:
-                out_file.writelines(lines)
-
-    return errs
-
-
-def getAllFilesUnderDir(root, pathFilter=None):
-    retList = []
-    for dirpath, dirnames, filenames in os.walk(root):
-        for fn in filenames:
-            filePath = os.path.join(dirpath, fn)
-            if pathFilter(filePath):
-                retList.append(filePath)
-    return retList
-
-
-def checkCopyright_main():
-    """
-    Checks for copyright headers in all the modified files. In case of local
-    repo, this script will just look for uncommitted files and in case of CI
-    it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch"
-    """
-    retVal = 0
-
-    argparser = argparse.ArgumentParser(
-        "Checks for a consistent copyright header in git's modified files"
-    )
-    argparser.add_argument(
-        "--update-current-year",
-        dest="update_current_year",
-        action="store_true",
-        required=False,
-        help="If set, "
-        "update the current year if a header is already "
-        "present and well formatted.",
-    )
-    argparser.add_argument(
-        "--git-modified-only",
-        dest="git_modified_only",
-        action="store_true",
-        required=False,
-        help="If set, "
-        "only files seen as modified by git will be "
-        "processed.",
-    )
-
-    args, dirs = argparser.parse_known_args()
-
-    if args.git_modified_only:
-        files = [f for f in modifiedFiles() if checkThisFile(f)]
-    else:
-        files = []
-        for d in [os.path.abspath(d) for d in dirs]:
-            if not os.path.isdir(d):
-                raise ValueError(f"{d} is not a directory.")
-            files += getAllFilesUnderDir(d, pathFilter=checkThisFile)
-
-    errors = []
-    for f in files:
-        errors += checkCopyright(f, args.update_current_year)
-
-    if len(errors) > 0:
-        if any(e[-1] is None for e in errors):
-            print("Copyright headers incomplete in some of the files!")
-        for e in errors:
-            print("  %s:%d Issue: %s" % (e[0], e[1], e[2]))
-        print("")
-        n_fixable = sum(1 for e in errors if e[-1] is not None)
-        path_parts = os.path.abspath(__file__).split(os.sep)
-        file_from_repo = os.sep.join(path_parts[path_parts.index("ci") :])
-        if n_fixable > 0 and not args.update_current_year:
-            print(
-                f"You can run `python {file_from_repo} --git-modified-only "
-                "--update-current-year` and stage the results in git to "
-                f"fix {n_fixable} of these errors.\n"
-            )
-        retVal = 1
-
-    return retVal
-
-
-if __name__ == "__main__":
-    sys.exit(checkCopyright_main())

From d158ccdbe651952bd649cb0f17c41467c5209824 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 4 Mar 2024 15:25:51 -0500
Subject: [PATCH 4/4] API for JSON unquoted whitespace normalization (#15033)

This work is a follow-up to PR #14931 which provided a proof-of-concept for using the a FST to normalize unquoted whitespaces. This PR implements the pre-processing FST in cuIO and adds a JSON reader option that needs to be set to true to invoke the normalizer.
Addresses feature request #14865

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15033
---
 cpp/CMakeLists.txt                            |   2 +-
 cpp/include/cudf/io/detail/json.hpp           |  10 +
 cpp/include/cudf/io/json.hpp                  |  31 +++
 ...normalization.cu => json_normalization.cu} | 142 ++++++++++++-
 cpp/src/io/json/read_json.cu                  |   7 +
 .../io/json_whitespace_normalization_test.cu  | 201 ++++--------------
 .../main/java/ai/rapids/cudf/JSONOptions.java |  15 ++
 java/src/main/java/ai/rapids/cudf/Table.java  |   9 +
 java/src/main/native/src/TableJni.cpp         |  27 ++-
 .../test/java/ai/rapids/cudf/TableTest.java   |  49 +++--
 java/src/test/resources/whitespaces.json      |   5 +
 11 files changed, 314 insertions(+), 184 deletions(-)
 rename cpp/src/io/json/{json_quote_normalization.cu => json_normalization.cu} (57%)
 create mode 100644 java/src/test/resources/whitespaces.json

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5fd6cd3544a..c74963be50d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -376,7 +376,7 @@ add_library(
   src/io/functions.cpp
   src/io/json/byte_range_info.cu
   src/io/json/json_column.cu
-  src/io/json/json_quote_normalization.cu
+  src/io/json/json_normalization.cu
   src/io/json/json_tree.cu
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 0eb0e17ea10..3f7f7e9bb32 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -63,4 +63,14 @@ rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& in
                                                   rmm::cuda_stream_view stream,
                                                   rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Normalize unquoted whitespace (space and tab characters) using FST
+ *
+ * @param inbuf Input device buffer
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ */
+rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index f0c3d48ab7e..593dd044d51 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -118,6 +118,9 @@ class json_reader_options {
   // Normalize single quotes
   bool _normalize_single_quotes = false;
 
+  // Normalize unquoted spaces and tabs
+  bool _normalize_whitespace = false;
+
   // Whether to recover after an invalid JSON line
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
@@ -265,6 +268,13 @@ class json_reader_options {
    */
   bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
 
+  /**
+   * @brief Whether the reader should normalize unquoted whitespace characters
+   *
+   * @returns true if the reader should normalize whitespace, false otherwise
+   */
+  bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }
+
   /**
    * @brief Queries the JSON reader's behavior on invalid JSON lines.
    *
@@ -358,6 +368,14 @@ class json_reader_options {
    */
   void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }
 
+  /**
+   * @brief Set whether the reader should enable normalization of unquoted whitespace
+   *
+   * @param val Boolean value to indicate whether the reader should normalize unquoted whitespace
+   * characters i.e. tabs and spaces
+   */
+  void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
@@ -533,6 +551,19 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether the reader should normalize unquoted whitespace
+   *
+   * @param val Boolean value to indicate whether the reader should normalize unquoted
+   * whitespace
+   * @return this for chaining
+   */
+  json_reader_options_builder& normalize_whitespace(bool val)
+  {
+    options._normalize_whitespace = val;
+    return *this;
+  }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_normalization.cu
similarity index 57%
rename from cpp/src/io/json/json_quote_normalization.cu
rename to cpp/src/io/json/json_normalization.cu
index a13b6e0b016..86e4da664a8 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -32,13 +32,15 @@
 
 namespace cudf::io::json {
 
-using SymbolT       = char;
-using StateT        = char;
+// Type used to represent the atomic symbol type used within the finite-state machine
+using SymbolT = char;
+using StateT  = char;
+
+// Type sufficiently large to index symbols within the input and output (may be unsigned)
 using SymbolOffsetT = uint32_t;
 
 namespace normalize_quotes {
 
-// Type sufficiently large to index symbols within the input and output (may be unsigned)
 enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
 enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
@@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes {
 
 }  // namespace normalize_quotes
 
+namespace normalize_whitespace {
+
+enum class dfa_symbol_group_id : uint32_t {
+  DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
+  ESCAPE_CHAR,         ///< Escape character SG: '\\'
+  NEWLINE_CHAR,        ///< Newline character SG: '\n'
+  WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
+  OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
+  NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
+};
+// Alias for readability of symbol group ids
+constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
+// The i-th string representing all the characters of a symbol group
+std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
+  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
+
+/**
+ * -------- FST states ---------
+ * -----------------------------
+ * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
+ *        |   quotes as well as any other character not enclosed by a string. Also handles
+ *        |   newline character present within a string
+ * TT_DQS | Double-quoted string state handling all characters within double quotes except
+ *        |   newline character
+ * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
+ *        |   state is necessary to process escaped double-quote characters. Without this
+ *        |   state, whitespaces following escaped double quotes inside strings may be removed.
+ *
+ * NOTE: An important case NOT handled by this FST is that of whitespace following newline
+ * characters within a string. Consider the following example
+ * Input:           {"a":"x\n y"}
+ * FST output:      {"a":"x\ny"}
+ * Expected output: {"a":"x\n y"}
+ * Such strings are not part of the JSON standard (characters allowed within quotes should
+ * have ASCII at least 0x20 i.e. space character and above) but may be encountered while
+ * reading JSON files
+ */
+enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
+// Aliases for readability of the transition table
+constexpr auto TT_OOS        = dfa_states::TT_OOS;
+constexpr auto TT_DQS        = dfa_states::TT_DQS;
+constexpr auto TT_DEC        = dfa_states::TT_DEC;
+constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
+
+// Transition table
+std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
+  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
+   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
+   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
+   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
+
+// The DFA's starting state
+constexpr StateT start_state = static_cast<StateT>(TT_OOS);
+
+struct TransduceToNormalizedWS {
+  /**
+   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
+   */
+  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
+                                                SymbolGroupT const match_id,
+                                                RelativeOffsetT const relative_offset,
+                                                SymbolT const read_symbol) const
+  {
+    // -------- TRANSLATION TABLE ------------
+    //      Let the alphabet set be Sigma
+    // ---------------------------------------
+    // ---------- NON-SPECIAL CASES: ----------
+    //      Output symbol same as input symbol <s>
+    // state | read_symbol <s>  -> output_symbol <s>
+    // DQS   | Sigma            -> Sigma
+    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
+    // DEC   | Sigma            -> Sigma
+    // ---------- SPECIAL CASES: --------------
+    //    Input symbol translates to output symbol
+    // OOS   | {<SPC>}          -> <nop>
+    // OOS   | {\t}             -> <nop>
+
+    // Case when read symbol is a space or tab but is unquoted
+    // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
+    // However, since there is no output in this case i.e. the count returned by
+    // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
+    // So skipping the check for this case.
+
+    // In all other cases, we have an output symbol for the input symbol.
+    // We simply output the input symbol
+    return read_symbol;
+  }
+
+  /**
+   * @brief Returns the number of output characters for a given transition.
+   * During whitespace normalization, we always emit one output character i.e., the input
+   * character, except when we need to remove the space/tab character
+   */
+  template <typename StateT, typename SymbolGroupT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
+                                                 SymbolGroupT const match_id,
+                                                 SymbolT const read_symbol) const
+  {
+    // Case when read symbol is a space or tab but is unquoted
+    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
+        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
+      return 0;
+    }
+    return 1;
+  }
+};
+
+}  // namespace normalize_whitespace
+
 namespace detail {
 
 rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
@@ -198,5 +310,29 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
   return outbuf;
 }
 
+rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::mr::device_memory_resource* mr)
+{
+  auto parser = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
+    fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
+    fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
+    stream);
+
+  rmm::device_uvector<SymbolT> outbuf(inbuf.size(), stream, mr);
+  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
+  parser.Transduce(inbuf.data(),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
+                   outbuf.data(),
+                   thrust::make_discard_iterator(),
+                   outbuf_size.data(),
+                   normalize_whitespace::start_state,
+                   stream);
+
+  outbuf.resize(outbuf_size.value(stream), stream);
+  return outbuf;
+}
+
 }  // namespace detail
 }  // namespace cudf::io::json
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index ba8acf2d47a..506d7b6cddc 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -235,6 +235,13 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
       normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
   }
 
+  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
+  // enabled, invoke pre-processing FST
+  if (reader_opts.is_enabled_normalize_whitespace()) {
+    buffer =
+      normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource());
+  }
+
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
   // For debug purposes, use host_parse_nested_json()
 }
diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json_whitespace_normalization_test.cu
index 545d8d2c4f9..336d360063f 100644
--- a/cpp/tests/io/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json_whitespace_normalization_test.cu
@@ -13,177 +13,41 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "io/fst/lookup_tables.cuh"
-#include "io/utilities/hostdevice_vector.hpp"
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/default_stream.hpp>
+#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/json.hpp>
+#include <cudf/io/json.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
-#include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
-#include <thrust/iterator/discard_iterator.h>
-
-#include <cstdlib>
 #include <string>
 
-namespace {
-// Type used to represent the atomic symbol type used within the finite-state machine
-using SymbolT = char;
-using StateT  = char;
-
-// Type sufficiently large to index symbols within the input and output (may be unsigned)
-using SymbolOffsetT = uint32_t;
-
-enum class dfa_symbol_group_id : uint32_t {
-  DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
-  ESCAPE_CHAR,         ///< Escape character SG: '\\'
-  NEWLINE_CHAR,        ///< Newline character SG: '\n'
-  WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
-  OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
-  NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
-};
-// Alias for readability of symbol group ids
-constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
-  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
-
-/**
- * -------- FST states ---------
- * -----------------------------
- * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
- *        |   quotes as well as any other character not enclosed by a string. Also handles
- *        |   newline character present within a string
- * TT_DQS | Double-quoted string state handling all characters within double quotes except
- *        |   newline character
- * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
- *        |   state is necessary to process escaped double-quote characters. Without this
- *        |   state, whitespaces following escaped double quotes inside strings may be removed.
- *
- * NOTE: An important case NOT handled by this FST is that of whitespace following newline
- * characters within a string. Consider the following example
- * Input:           {"a":"x\n y"}
- * FST output:      {"a":"x\ny"}
- * Expected output: {"a":"x\n y"}
- */
-enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
-// Aliases for readability of the transition table
-constexpr auto TT_OOS        = dfa_states::TT_OOS;
-constexpr auto TT_DQS        = dfa_states::TT_DQS;
-constexpr auto TT_DEC        = dfa_states::TT_DEC;
-constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
-
-// Transition table
-std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
-  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
-   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
-   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
-   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
-
-// The DFA's starting state
-constexpr StateT start_state = static_cast<StateT>(TT_OOS);
-
-struct TransduceToNormalizedWS {
-  /**
-   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
-   */
-  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
-  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
-                                                SymbolGroupT const match_id,
-                                                RelativeOffsetT const relative_offset,
-                                                SymbolT const read_symbol) const
-  {
-    // -------- TRANSLATION TABLE ------------
-    //      Let the alphabet set be Sigma
-    // ---------------------------------------
-    // ---------- NON-SPECIAL CASES: ----------
-    //      Output symbol same as input symbol <s>
-    // state | read_symbol <s>  -> output_symbol <s>
-    // DQS   | Sigma            -> Sigma
-    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
-    // DEC   | Sigma            -> Sigma
-    // ---------- SPECIAL CASES: --------------
-    //    Input symbol translates to output symbol
-    // OOS   | {<SPC>}          -> <nop>
-    // OOS   | {\t}             -> <nop>
-
-    // Case when read symbol is a space or tab but is unquoted
-    // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
-    // However, since there is no output in this case i.e. the count returned by
-    // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
-    // So skipping the check for this case.
-
-    // In all other cases, we have an output symbol for the input symbol.
-    // We simply output the input symbol
-    return read_symbol;
-  }
-
-  /**
-   * @brief Returns the number of output characters for a given transition.
-   * During whitespace normalization, we always emit one output character i.e., the input
-   * character, except when we need to remove the space/tab character
-   */
-  template <typename StateT, typename SymbolGroupT, typename SymbolT>
-  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
-                                                 SymbolGroupT const match_id,
-                                                 SymbolT const read_symbol) const
-  {
-    // Case when read symbol is a space or tab but is unquoted
-    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
-        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
-      return 0;
-    }
-    return 1;
-  }
-};
-}  // namespace
-
 // Base test fixture for tests
 struct JsonWSNormalizationTest : public cudf::test::BaseFixture {};
 
-void run_test(std::string const& input, std::string const& output)
+void run_test(std::string const& host_input, std::string const& expected_host_output)
 {
-  auto parser = cudf::io::fst::detail::make_fst(
-    cudf::io::fst::detail::make_symbol_group_lut(wna_sgs),
-    cudf::io::fst::detail::make_transition_table(wna_state_tt),
-    cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedWS{}),
-    cudf::test::get_default_stream());
-
-  auto d_input_scalar = cudf::make_string_scalar(input, cudf::test::get_default_stream());
-  auto& d_input       = static_cast<cudf::scalar_type_t<std::string>&>(*d_input_scalar);
+  auto stream_view  = cudf::get_default_stream();
+  auto device_input = cudf::detail::make_device_uvector_async(
+    host_input, stream_view, rmm::mr::get_current_device_resource());
 
-  // Prepare input & output buffers
-  constexpr std::size_t single_item = 1;
-  cudf::detail::hostdevice_vector<SymbolT> output_gpu(input.size(),
-                                                      cudf::test::get_default_stream());
-  cudf::detail::hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item,
-                                                                 cudf::test::get_default_stream());
+  // Preprocessing FST
+  auto device_fst_output = cudf::io::json::detail::normalize_whitespace(
+    std::move(device_input), stream_view, rmm::mr::get_current_device_resource());
 
-  // Allocate device-side temporary storage & run algorithm
-  parser.Transduce(d_input.data(),
-                   static_cast<SymbolOffsetT>(d_input.size()),
-                   output_gpu.device_ptr(),
-                   thrust::make_discard_iterator(),
-                   output_gpu_size.device_ptr(),
-                   start_state,
-                   cudf::test::get_default_stream());
+  auto const preprocessed_host_output =
+    cudf::detail::make_std_vector_sync(device_fst_output, stream_view);
 
-  // Async copy results from device to host
-  output_gpu.device_to_host_async(cudf::test::get_default_stream());
-  output_gpu_size.device_to_host_async(cudf::test::get_default_stream());
-
-  // Make sure results have been copied back to host
-  cudf::test::get_default_stream().synchronize();
-
-  // Verify results
-  ASSERT_EQ(output_gpu_size[0], output.size());
-  CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size());
+  ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size());
+  CUDF_TEST_EXPECT_VECTOR_EQUAL(
+    preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
 }
 
 TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces)
@@ -259,4 +123,33 @@ TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput)
   run_test(input, output);
 }
 
+TEST_F(JsonWSNormalizationTest, ReadJsonOption)
+{
+  // When mixed type fields are read as strings, the table read will differ depending the
+  // value of normalize_whitespace
+
+  // Test input
+  std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(true);
+
+  cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
+
+  // Expected table
+  std::string const expected_input = R"({ "a" : {"b":"c"}})";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(false);
+
+  cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index 62496e32f7a..b37d0d88ec9 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -31,6 +31,7 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean lines;
   private final boolean recoverWithNull;
   private final boolean normalizeSingleQuotes;
+  private final boolean normalizeWhitespace;
   private final boolean mixedTypesAsStrings;
   private final boolean keepStringQuotes;
 
@@ -40,6 +41,7 @@ private JSONOptions(Builder builder) {
     lines = builder.lines;
     recoverWithNull = builder.recoverWithNull;
     normalizeSingleQuotes = builder.normalizeSingleQuotes;
+    normalizeWhitespace = builder.normalizeWhitespace;
     mixedTypesAsStrings = builder.mixedTypesAsStrings;
     keepStringQuotes = builder.keepQuotes;
   }
@@ -61,6 +63,10 @@ public boolean isNormalizeSingleQuotes() {
     return normalizeSingleQuotes;
   }
 
+  public boolean isNormalizeWhitespace() {
+    return normalizeWhitespace;
+  }
+
   public boolean isMixedTypesAsStrings() {
     return mixedTypesAsStrings;
   }
@@ -84,6 +90,7 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
 
     private boolean recoverWithNull = false;
     private boolean normalizeSingleQuotes = false;
+    private boolean normalizeWhitespace = false;
 
     private boolean mixedTypesAsStrings = false;
     private boolean keepQuotes = false;
@@ -131,6 +138,14 @@ public Builder withNormalizeSingleQuotes(boolean normalizeSingleQuotes) {
       return this;
     }
 
+    /**
+     * Should the unquoted whitespace be removed.
+     */
+    public Builder withNormalizeWhitespace(boolean normalizeWhitespace) {
+      this.normalizeWhitespace = normalizeWhitespace;
+      return this;
+    }
+
     /**
      * Specify how to handle columns that contain mixed types.
      *
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index c562e08b4c8..a1bdfe9a796 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -252,6 +252,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                         boolean dayFirst, boolean lines,
                                         boolean recoverWithNulls,
                                         boolean normalizeSingleQuotes,
+                                        boolean normalizeWhitespace,
                                         boolean mixedTypesAsStrings,
                                         boolean keepStringQuotes) throws CudfException;
 
@@ -260,6 +261,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
                                       boolean dayFirst, boolean lines,
                                       boolean recoverWithNulls,
                                       boolean normalizeSingleQuotes,
+                                      boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
                                       long dsHandle) throws CudfException;
@@ -267,6 +269,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
   private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
                                       boolean recoverWithNulls,
                                       boolean normalizeSingleQuotes,
+                                      boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
                                       long dsHandle) throws CudfException;
@@ -275,6 +278,7 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean lines,
                                               boolean recoverWithNulls,
                                               boolean normalizeSingleQuotes,
+                                              boolean normalizeWhitespace,
                                               boolean mixedTypesAsStrings,
                                               boolean keepStringQuotes) throws CudfException;
 
@@ -1257,6 +1261,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
                     0, 0,
                     opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
                     opts.isNormalizeSingleQuotes(),
+                    opts.isNormalizeWhitespace(),
                     opts.isMixedTypesAsStrings(),
                 opts.keepStringQuotes()))) {
 
@@ -1312,6 +1317,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
     return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
         opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
         opts.isNormalizeSingleQuotes(),
+        opts.isNormalizeWhitespace(),
         opts.isMixedTypesAsStrings(), opts.keepStringQuotes()));
   }
 
@@ -1327,6 +1333,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
           opts.isLines(),
           opts.isRecoverWithNull(),
           opts.isNormalizeSingleQuotes(),
+          opts.isNormalizeWhitespace(),
           opts.isMixedTypesAsStrings(),
           opts.keepStringQuotes(),
           dsHandle));
@@ -1358,6 +1365,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
             schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
             buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
             opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+            opts.isNormalizeWhitespace(),
             opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
       return gatherJSONColumns(schema, twm);
     }
@@ -1375,6 +1383,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
     try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
         schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
         opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+        opts.isNormalizeWhitespace(),
         opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
       return gatherJSONColumns(schema, twm);
     } finally {
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 84f1174fd3f..357705824d2 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1429,8 +1429,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(
     JNIEnv *env, jclass, jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
-    jlong ds_handle) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes, jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1448,8 +1448,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .keep_quotes(keep_quotes)
-            .mixed_types_as_string(mixed_types_as_string);
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+            .mixed_types_as_string(mixed_types_as_string)
+            .keep_quotes(keep_quotes);
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1461,8 +1462,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
     JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
-    jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string,
-    jboolean keep_quotes) {
+    jboolean recover_with_null, jboolean normalize_single_quotes, jboolean normalize_whitespace,
+    jboolean mixed_types_as_string, jboolean keep_quotes) {
 
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1484,8 +1485,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .keep_quotes(keep_quotes)
-            .mixed_types_as_string(mixed_types_as_string);
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+            .mixed_types_as_string(mixed_types_as_string)
+            .keep_quotes(keep_quotes);
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1573,8 +1575,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
     jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
-    jlong ds_handle) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes, jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1606,6 +1608,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
             .mixed_types_as_string(mixed_types_as_string)
             .keep_quotes(keep_quotes);
 
@@ -1646,7 +1649,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
     jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length,
     jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes) {
 
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1693,6 +1697,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
             .mixed_types_as_string(mixed_types_as_string)
             .keep_quotes(keep_quotes);
 
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index bee8d1cbb88..3f0470d854a 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -88,6 +88,7 @@ public class TableTest extends CudfTestBase {
   private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
   private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");
   private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json");
+  private static final File TEST_JSON_WHITESPACES_FILE = TestUtils.getResourceAsFile("whitespaces.json");
   private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json");
   private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json");
 
@@ -349,6 +350,39 @@ void testReadSingleQuotesJSONFile() throws IOException {
   }
 
   @Test
+  void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException {
+    Schema schema = Schema.builder()
+      .column(DType.STRING, "A")
+      .build();
+    JSONOptions opts = JSONOptions.builder()
+      .withLines(true)
+      .withNormalizeSingleQuotes(false)
+      .build();
+    try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) {
+      assertThrows(CudfException.class, () ->
+        Table.readJSON(schema, opts, source));
+    }
+  }
+
+  @Test
+  void testReadWhitespacesJSONFile() throws IOException {
+    Schema schema = Schema.builder()
+            .column(DType.STRING, "a")
+            .build();
+    JSONOptions opts = JSONOptions.builder()
+            .withLines(true)
+            .withMixedTypesAsStrings(true)
+            .withNormalizeWhitespace(true)
+            .build();
+    try (Table expected = new Table.TestBuilder()
+            .column("b", "50", "[1,2,3,4,5,6,7,8]", "{\"c\":\"d\"}", "b")
+            .build();
+         MultiBufferDataSource source = sourceFrom(TEST_JSON_WHITESPACES_FILE);
+         Table table = Table.readJSON(schema, opts, source)) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
   void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
     Schema schema = Schema.builder()
         .column(DType.STRING, "A")
@@ -547,21 +581,6 @@ void testReadMixedType2JSONFile() throws IOException {
     }
   }
 
-  @Test
-  void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException {
-    Schema schema = Schema.builder()
-      .column(DType.STRING, "A")
-      .build();
-    JSONOptions opts = JSONOptions.builder()
-      .withLines(true)
-      .withNormalizeSingleQuotes(false)
-      .build();
-    try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) {
-      assertThrows(CudfException.class, () ->
-        Table.readJSON(schema, opts, source));
-    }
-  }
-
   @Test
   void testReadJSONFromDataSource() throws IOException {
     Schema schema = Schema.builder()
diff --git a/java/src/test/resources/whitespaces.json b/java/src/test/resources/whitespaces.json
new file mode 100644
index 00000000000..f5ddd8cde5f
--- /dev/null
+++ b/java/src/test/resources/whitespaces.json
@@ -0,0 +1,5 @@
+{"a":"b"}
+ { "a" : "50" }
+{"a": [1, 2, 3, 4, 5, 6, 7, 8]}
+{"a": {"c": "d"}}
+{"a":   "b"}