From d158ccdbe651952bd649cb0f17c41467c5209824 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 4 Mar 2024 15:25:51 -0500
Subject: [PATCH] API for JSON unquoted whitespace normalization (#15033)

This work is a follow-up to PR #14931, which provided a proof-of-concept for using an FST to normalize unquoted whitespace. This PR implements the pre-processing FST in cuIO and adds a JSON reader option that must be set to true to invoke the normalizer.
Addresses feature request #14865

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15033
---
 cpp/CMakeLists.txt                            |   2 +-
 cpp/include/cudf/io/detail/json.hpp           |  10 +
 cpp/include/cudf/io/json.hpp                  |  31 +++
 ...normalization.cu => json_normalization.cu} | 142 ++++++++++++-
 cpp/src/io/json/read_json.cu                  |   7 +
 .../io/json_whitespace_normalization_test.cu  | 201 ++++--------------
 .../main/java/ai/rapids/cudf/JSONOptions.java |  15 ++
 java/src/main/java/ai/rapids/cudf/Table.java  |   9 +
 java/src/main/native/src/TableJni.cpp         |  27 ++-
 .../test/java/ai/rapids/cudf/TableTest.java   |  49 +++--
 java/src/test/resources/whitespaces.json      |   5 +
 11 files changed, 314 insertions(+), 184 deletions(-)
 rename cpp/src/io/json/{json_quote_normalization.cu => json_normalization.cu} (57%)
 create mode 100644 java/src/test/resources/whitespaces.json
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5fd6cd3544a..c74963be50d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -376,7 +376,7 @@ add_library(
   src/io/functions.cpp
   src/io/json/byte_range_info.cu
   src/io/json/json_column.cu
-  src/io/json/json_quote_normalization.cu
+  src/io/json/json_normalization.cu
   src/io/json/json_tree.cu
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 0eb0e17ea10..3f7f7e9bb32 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -63,4 +63,14 @@ rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& in
                                                   rmm::cuda_stream_view stream,
                                                   rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Normalize unquoted whitespace (space and tab characters) using FST
+ *
+ * @param inbuf Input device buffer
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ */
+rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index f0c3d48ab7e..593dd044d51 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -118,6 +118,9 @@ class json_reader_options {
   // Normalize single quotes
   bool _normalize_single_quotes = false;
 
+  // Normalize unquoted spaces and tabs
+  bool _normalize_whitespace = false;
+
   // Whether to recover after an invalid JSON line
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
@@ -265,6 +268,13 @@ class json_reader_options {
    */
   bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
 
+  /**
+   * @brief Whether the reader should normalize unquoted whitespace characters
+   *
+   * @returns true if the reader should normalize whitespace, false otherwise
+   */
+  bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }
+
   /**
    * @brief Queries the JSON reader's behavior on invalid JSON lines.
    *
@@ -358,6 +368,14 @@ class json_reader_options {
    */
   void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }
 
+  /**
+   * @brief Set whether the reader should enable normalization of unquoted whitespace
+   *
+   * @param val Boolean value to indicate whether the reader should normalize unquoted whitespace
+   * characters i.e. tabs and spaces
+   */
+  void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
@@ -533,6 +551,19 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether the reader should normalize unquoted whitespace
+   *
+   * @param val Boolean value to indicate whether the reader should normalize unquoted
+   * whitespace
+   * @return this for chaining
+   */
+  json_reader_options_builder& normalize_whitespace(bool val)
+  {
+    options._normalize_whitespace = val;
+    return *this;
+  }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_normalization.cu
similarity index 57%
rename from cpp/src/io/json/json_quote_normalization.cu
rename to cpp/src/io/json/json_normalization.cu
index a13b6e0b016..86e4da664a8 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -32,13 +32,15 @@
 
 namespace cudf::io::json {
 
-using SymbolT       = char;
-using StateT        = char;
+// Type used to represent the atomic symbol type used within the finite-state machine
+using SymbolT = char;
+using StateT  = char;
+
+// Type sufficiently large to index symbols within the input and output (may be unsigned)
 using SymbolOffsetT = uint32_t;
 
 namespace normalize_quotes {
 
-// Type sufficiently large to index symbols within the input and output (may be unsigned)
 enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
 enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
@@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes {
 
 }  // namespace normalize_quotes
 
+namespace normalize_whitespace {
+
+enum class dfa_symbol_group_id : uint32_t {
+  DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
+  ESCAPE_CHAR,         ///< Escape character SG: '\\'
+  NEWLINE_CHAR,        ///< Newline character SG: '\n'
+  WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
+  OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
+  NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
+};
+// Alias for readability of symbol group ids
+constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
+// The i-th string representing all the characters of a symbol group
+std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
+  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
+
+/**
+ * -------- FST states ---------
+ * -----------------------------
+ * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
+ *        |   quotes as well as any other character not enclosed by a string. Also handles
+ *        |   newline character present within a string
+ * TT_DQS | Double-quoted string state handling all characters within double quotes except
+ *        |   newline character
+ * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
+ *        |   state is necessary to process escaped double-quote characters. Without this
+ *        |   state, whitespaces following escaped double quotes inside strings may be removed.
+ *
+ * NOTE: An important case NOT handled by this FST is that of whitespace following newline
+ * characters within a string. Consider the following example
+ * Input:           {"a":"x\n y"}
+ * FST output:      {"a":"x\ny"}
+ * Expected output: {"a":"x\n y"}
+ * Such strings are not part of the JSON standard (characters allowed within quotes should
+ * have an ASCII value of at least 0x20, i.e. the space character and above) but may be
+ * encountered while reading JSON files
+ */
+enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
+// Aliases for readability of the transition table
+constexpr auto TT_OOS        = dfa_states::TT_OOS;
+constexpr auto TT_DQS        = dfa_states::TT_DQS;
+constexpr auto TT_DEC        = dfa_states::TT_DEC;
+constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
+
+// Transition table
+std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
+  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
+   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
+   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
+   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
+
+// The DFA's starting state
+constexpr StateT start_state = static_cast<StateT>(TT_OOS);
+
+struct TransduceToNormalizedWS {
+  /**
+   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
+   */
+  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
+                                                SymbolGroupT const match_id,
+                                                RelativeOffsetT const relative_offset,
+                                                SymbolT const read_symbol) const
+  {
+    // -------- TRANSLATION TABLE ------------
+    //      Let the alphabet set be Sigma
+    // ---------------------------------------
+    // ---------- NON-SPECIAL CASES: ----------
+    //      Output symbol same as input symbol <s>
+    // state | read_symbol <s>  -> output_symbol <s>
+    // DQS   | Sigma            -> Sigma
+    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
+    // DEC   | Sigma            -> Sigma
+    // ---------- SPECIAL CASES: --------------
+    //    Input symbol translates to output symbol
+    // OOS   | {<SPC>}          -> <nop>
+    // OOS   | {\t}             -> <nop>
+
+    // Case when read symbol is a space or tab but is unquoted
+    // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
+    // However, since there is no output in this case i.e. the count returned by
+    // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
+    // So skipping the check for this case.
+
+    // In all other cases, we have an output symbol for the input symbol.
+    // We simply output the input symbol
+    return read_symbol;
+  }
+
+  /**
+   * @brief Returns the number of output characters for a given transition.
+   * During whitespace normalization, we always emit one output character i.e., the input
+   * character, except when we need to remove the space/tab character
+   */
+  template <typename StateT, typename SymbolGroupT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
+                                                 SymbolGroupT const match_id,
+                                                 SymbolT const read_symbol) const
+  {
+    // Case when read symbol is a space or tab but is unquoted
+    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
+        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
+      return 0;
+    }
+    return 1;
+  }
+};
+
+}  // namespace normalize_whitespace
+
 namespace detail {
 
 rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
@@ -198,5 +310,29 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
   return outbuf;
 }
 
+rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::mr::device_memory_resource* mr)
+{
+  auto parser = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
+    fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
+    fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
+    stream);
+
+  rmm::device_uvector<SymbolT> outbuf(inbuf.size(), stream, mr);
+  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
+  parser.Transduce(inbuf.data(),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
+                   outbuf.data(),
+                   thrust::make_discard_iterator(),
+                   outbuf_size.data(),
+                   normalize_whitespace::start_state,
+                   stream);
+
+  outbuf.resize(outbuf_size.value(stream), stream);
+  return outbuf;
+}
+
 }  // namespace detail
 }  // namespace cudf::io::json
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index ba8acf2d47a..506d7b6cddc 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -235,6 +235,13 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
       normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
   }
 
+  // If the option to normalize whitespace is enabled, invoke the pre-processing FST to remove
+  // unquoted spaces and tabs from the input JSON buffer
+  if (reader_opts.is_enabled_normalize_whitespace()) {
+    buffer =
+      normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource());
+  }
+
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
   // For debug purposes, use host_parse_nested_json()
 }
diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json_whitespace_normalization_test.cu
index 545d8d2c4f9..336d360063f 100644
--- a/cpp/tests/io/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json_whitespace_normalization_test.cu
@@ -13,177 +13,41 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "io/fst/lookup_tables.cuh"
-#include "io/utilities/hostdevice_vector.hpp"
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/default_stream.hpp>
+#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/json.hpp>
+#include <cudf/io/json.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
-#include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
-#include <thrust/iterator/discard_iterator.h>
-
-#include <cstdlib>
 #include <string>
 
-namespace {
-// Type used to represent the atomic symbol type used within the finite-state machine
-using SymbolT = char;
-using StateT  = char;
-
-// Type sufficiently large to index symbols within the input and output (may be unsigned)
-using SymbolOffsetT = uint32_t;
-
-enum class dfa_symbol_group_id : uint32_t {
-  DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
-  ESCAPE_CHAR,         ///< Escape character SG: '\\'
-  NEWLINE_CHAR,        ///< Newline character SG: '\n'
-  WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
-  OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
-  NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
-};
-// Alias for readability of symbol group ids
-constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
-  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
-
-/**
- * -------- FST states ---------
- * -----------------------------
- * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
- *        |   quotes as well as any other character not enclosed by a string. Also handles
- *        |   newline character present within a string
- * TT_DQS | Double-quoted string state handling all characters within double quotes except
- *        |   newline character
- * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
- *        |   state is necessary to process escaped double-quote characters. Without this
- *        |   state, whitespaces following escaped double quotes inside strings may be removed.
- *
- * NOTE: An important case NOT handled by this FST is that of whitespace following newline
- * characters within a string. Consider the following example
- * Input:           {"a":"x\n y"}
- * FST output:      {"a":"x\ny"}
- * Expected output: {"a":"x\n y"}
- */
-enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
-// Aliases for readability of the transition table
-constexpr auto TT_OOS        = dfa_states::TT_OOS;
-constexpr auto TT_DQS        = dfa_states::TT_DQS;
-constexpr auto TT_DEC        = dfa_states::TT_DEC;
-constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
-
-// Transition table
-std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
-  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
-   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
-   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
-   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
-
-// The DFA's starting state
-constexpr StateT start_state = static_cast<StateT>(TT_OOS);
-
-struct TransduceToNormalizedWS {
-  /**
-   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
-   */
-  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
-  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
-                                                SymbolGroupT const match_id,
-                                                RelativeOffsetT const relative_offset,
-                                                SymbolT const read_symbol) const
-  {
-    // -------- TRANSLATION TABLE ------------
-    //      Let the alphabet set be Sigma
-    // ---------------------------------------
-    // ---------- NON-SPECIAL CASES: ----------
-    //      Output symbol same as input symbol <s>
-    // state | read_symbol <s>  -> output_symbol <s>
-    // DQS   | Sigma            -> Sigma
-    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
-    // DEC   | Sigma            -> Sigma
-    // ---------- SPECIAL CASES: --------------
-    //    Input symbol translates to output symbol
-    // OOS   | {<SPC>}          -> <nop>
-    // OOS   | {\t}             -> <nop>
-
-    // Case when read symbol is a space or tab but is unquoted
-    // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
-    // However, since there is no output in this case i.e. the count returned by
-    // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
-    // So skipping the check for this case.
-
-    // In all other cases, we have an output symbol for the input symbol.
-    // We simply output the input symbol
-    return read_symbol;
-  }
-
-  /**
-   * @brief Returns the number of output characters for a given transition.
-   * During whitespace normalization, we always emit one output character i.e., the input
-   * character, except when we need to remove the space/tab character
-   */
-  template <typename StateT, typename SymbolGroupT, typename SymbolT>
-  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
-                                                 SymbolGroupT const match_id,
-                                                 SymbolT const read_symbol) const
-  {
-    // Case when read symbol is a space or tab but is unquoted
-    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
-        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
-      return 0;
-    }
-    return 1;
-  }
-};
-}  // namespace
-
 // Base test fixture for tests
 struct JsonWSNormalizationTest : public cudf::test::BaseFixture {};
 
-void run_test(std::string const& input, std::string const& output)
+void run_test(std::string const& host_input, std::string const& expected_host_output)
 {
-  auto parser = cudf::io::fst::detail::make_fst(
-    cudf::io::fst::detail::make_symbol_group_lut(wna_sgs),
-    cudf::io::fst::detail::make_transition_table(wna_state_tt),
-    cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedWS{}),
-    cudf::test::get_default_stream());
-
-  auto d_input_scalar = cudf::make_string_scalar(input, cudf::test::get_default_stream());
-  auto& d_input       = static_cast<cudf::scalar_type_t<std::string>&>(*d_input_scalar);
+  auto stream_view  = cudf::get_default_stream();
+  auto device_input = cudf::detail::make_device_uvector_async(
+    host_input, stream_view, rmm::mr::get_current_device_resource());
 
-  // Prepare input & output buffers
-  constexpr std::size_t single_item = 1;
-  cudf::detail::hostdevice_vector<SymbolT> output_gpu(input.size(),
-                                                      cudf::test::get_default_stream());
-  cudf::detail::hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item,
-                                                                 cudf::test::get_default_stream());
+  // Preprocessing FST
+  auto device_fst_output = cudf::io::json::detail::normalize_whitespace(
+    std::move(device_input), stream_view, rmm::mr::get_current_device_resource());
 
-  // Allocate device-side temporary storage & run algorithm
-  parser.Transduce(d_input.data(),
-                   static_cast<SymbolOffsetT>(d_input.size()),
-                   output_gpu.device_ptr(),
-                   thrust::make_discard_iterator(),
-                   output_gpu_size.device_ptr(),
-                   start_state,
-                   cudf::test::get_default_stream());
+  auto const preprocessed_host_output =
+    cudf::detail::make_std_vector_sync(device_fst_output, stream_view);
 
-  // Async copy results from device to host
-  output_gpu.device_to_host_async(cudf::test::get_default_stream());
-  output_gpu_size.device_to_host_async(cudf::test::get_default_stream());
-
-  // Make sure results have been copied back to host
-  cudf::test::get_default_stream().synchronize();
-
-  // Verify results
-  ASSERT_EQ(output_gpu_size[0], output.size());
-  CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size());
+  ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size());
+  CUDF_TEST_EXPECT_VECTOR_EQUAL(
+    preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
 }
 
 TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces)
@@ -259,4 +123,33 @@ TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput)
   run_test(input, output);
 }
 
+TEST_F(JsonWSNormalizationTest, ReadJsonOption)
+{
+  // When mixed type fields are read as strings, the table read will differ depending on the
+  // value of normalize_whitespace
+
+  // Test input
+  std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(true);
+
+  cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
+
+  // Expected table
+  std::string const expected_input = R"({ "a" : {"b":"c"}})";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(false);
+
+  cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index 62496e32f7a..b37d0d88ec9 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -31,6 +31,7 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean lines;
   private final boolean recoverWithNull;
   private final boolean normalizeSingleQuotes;
+  private final boolean normalizeWhitespace;
   private final boolean mixedTypesAsStrings;
   private final boolean keepStringQuotes;
 
@@ -40,6 +41,7 @@ private JSONOptions(Builder builder) {
     lines = builder.lines;
     recoverWithNull = builder.recoverWithNull;
     normalizeSingleQuotes = builder.normalizeSingleQuotes;
+    normalizeWhitespace = builder.normalizeWhitespace;
     mixedTypesAsStrings = builder.mixedTypesAsStrings;
     keepStringQuotes = builder.keepQuotes;
   }
@@ -61,6 +63,10 @@ public boolean isNormalizeSingleQuotes() {
     return normalizeSingleQuotes;
   }
 
+  public boolean isNormalizeWhitespace() {
+    return normalizeWhitespace;
+  }
+
   public boolean isMixedTypesAsStrings() {
     return mixedTypesAsStrings;
   }
@@ -84,6 +90,7 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
 
     private boolean recoverWithNull = false;
     private boolean normalizeSingleQuotes = false;
+    private boolean normalizeWhitespace = false;
 
     private boolean mixedTypesAsStrings = false;
     private boolean keepQuotes = false;
@@ -131,6 +138,14 @@ public Builder withNormalizeSingleQuotes(boolean normalizeSingleQuotes) {
       return this;
     }
 
+    /**
+     * Should the unquoted whitespace be removed.
+     */
+    public Builder withNormalizeWhitespace(boolean normalizeWhitespace) {
+      this.normalizeWhitespace = normalizeWhitespace;
+      return this;
+    }
+
     /**
      * Specify how to handle columns that contain mixed types.
      *
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index c562e08b4c8..a1bdfe9a796 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -252,6 +252,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                         boolean dayFirst, boolean lines,
                                         boolean recoverWithNulls,
                                         boolean normalizeSingleQuotes,
+                                        boolean normalizeWhitespace,
                                         boolean mixedTypesAsStrings,
                                         boolean keepStringQuotes) throws CudfException;
 
@@ -260,6 +261,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
                                       boolean dayFirst, boolean lines,
                                       boolean recoverWithNulls,
                                       boolean normalizeSingleQuotes,
+                                      boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
                                       long dsHandle) throws CudfException;
@@ -267,6 +269,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
   private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
                                       boolean recoverWithNulls,
                                       boolean normalizeSingleQuotes,
+                                      boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
                                       long dsHandle) throws CudfException;
@@ -275,6 +278,7 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean lines,
                                               boolean recoverWithNulls,
                                               boolean normalizeSingleQuotes,
+                                              boolean normalizeWhitespace,
                                               boolean mixedTypesAsStrings,
                                               boolean keepStringQuotes) throws CudfException;
 
@@ -1257,6 +1261,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
                     0, 0,
                     opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
                     opts.isNormalizeSingleQuotes(),
+                    opts.isNormalizeWhitespace(),
                     opts.isMixedTypesAsStrings(),
                 opts.keepStringQuotes()))) {
 
@@ -1312,6 +1317,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
     return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
         opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
         opts.isNormalizeSingleQuotes(),
+        opts.isNormalizeWhitespace(),
         opts.isMixedTypesAsStrings(), opts.keepStringQuotes()));
   }
 
@@ -1327,6 +1333,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
           opts.isLines(),
           opts.isRecoverWithNull(),
           opts.isNormalizeSingleQuotes(),
+          opts.isNormalizeWhitespace(),
           opts.isMixedTypesAsStrings(),
           opts.keepStringQuotes(),
           dsHandle));
@@ -1358,6 +1365,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
             schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
             buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
             opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+            opts.isNormalizeWhitespace(),
             opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
       return gatherJSONColumns(schema, twm);
     }
@@ -1375,6 +1383,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
     try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
         schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
         opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+        opts.isNormalizeWhitespace(),
         opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
       return gatherJSONColumns(schema, twm);
     } finally {
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 84f1174fd3f..357705824d2 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1429,8 +1429,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(
     JNIEnv *env, jclass, jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
-    jlong ds_handle) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes, jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1448,8 +1448,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .keep_quotes(keep_quotes)
-            .mixed_types_as_string(mixed_types_as_string);
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+            .mixed_types_as_string(mixed_types_as_string)
+            .keep_quotes(keep_quotes);
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1461,8 +1462,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
     JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
-    jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string,
-    jboolean keep_quotes) {
+    jboolean recover_with_null, jboolean normalize_single_quotes, jboolean normalize_whitespace,
+    jboolean mixed_types_as_string, jboolean keep_quotes) {
 
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1484,8 +1485,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .keep_quotes(keep_quotes)
-            .mixed_types_as_string(mixed_types_as_string);
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+            .mixed_types_as_string(mixed_types_as_string)
+            .keep_quotes(keep_quotes);
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1573,8 +1575,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
     jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
-    jlong ds_handle) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes, jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1606,6 +1608,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
             .mixed_types_as_string(mixed_types_as_string)
             .keep_quotes(keep_quotes);
 
@@ -1646,7 +1649,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
     jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length,
     jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes) {
 
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1693,6 +1697,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
             .mixed_types_as_string(mixed_types_as_string)
             .keep_quotes(keep_quotes);
 
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index bee8d1cbb88..3f0470d854a 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -88,6 +88,7 @@ public class TableTest extends CudfTestBase {
   private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
   private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");
   private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json");
+  private static final File TEST_JSON_WHITESPACES_FILE = TestUtils.getResourceAsFile("whitespaces.json");
   private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json");
   private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json");
 
@@ -349,6 +350,40 @@ void testReadSingleQuotesJSONFile() throws IOException {
   }
 
   @Test
+  void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException {
+    Schema schema = Schema.builder()
+      .column(DType.STRING, "A")
+      .build();
+    JSONOptions opts = JSONOptions.builder()
+      .withLines(true)
+      .withNormalizeSingleQuotes(false)
+      .build();
+    try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) {
+      assertThrows(CudfException.class, () ->
+        Table.readJSON(schema, opts, source));
+    }
+  }
+
+  @Test
+  void testReadWhitespacesJSONFile() throws IOException {
+    Schema schema = Schema.builder()
+            .column(DType.STRING, "a")
+            .build();
+    JSONOptions opts = JSONOptions.builder()
+            .withLines(true)
+            .withMixedTypesAsStrings(true)
+            .withNormalizeWhitespace(true)
+            .build();
+    try (Table expected = new Table.TestBuilder()
+            .column("b", "50", "[1,2,3,4,5,6,7,8]", "{\"c\":\"d\"}", "b")
+            .build();
+         MultiBufferDataSource source = sourceFrom(TEST_JSON_WHITESPACES_FILE);
+         Table table = Table.readJSON(schema, opts, source)) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
   void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
     Schema schema = Schema.builder()
         .column(DType.STRING, "A")
@@ -547,21 +582,6 @@ void testReadMixedType2JSONFile() throws IOException {
     }
   }
 
-  @Test
-  void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException {
-    Schema schema = Schema.builder()
-      .column(DType.STRING, "A")
-      .build();
-    JSONOptions opts = JSONOptions.builder()
-      .withLines(true)
-      .withNormalizeSingleQuotes(false)
-      .build();
-    try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) {
-      assertThrows(CudfException.class, () ->
-        Table.readJSON(schema, opts, source));
-    }
-  }
-
   @Test
   void testReadJSONFromDataSource() throws IOException {
     Schema schema = Schema.builder()
diff --git a/java/src/test/resources/whitespaces.json b/java/src/test/resources/whitespaces.json
new file mode 100644
index 00000000000..f5ddd8cde5f
--- /dev/null
+++ b/java/src/test/resources/whitespaces.json
@@ -0,0 +1,5 @@
+{"a":"b"}
+ { "a" : "50" }
+{"a": [1, 2, 3, 4, 5, 6, 7, 8]}
+{"a": {"c": "d"}}
+{"a":   "b"}