From 4423d201a3871aa7bb51e415306274890ac5bc82 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 30 Sep 2022 16:03:37 +0530
Subject: [PATCH 01/39] trim quotes for non-string values in nested json
 parsing

---
 cpp/include/cudf/io/detail/data_casting.cuh | 12 +++++++-----
 cpp/src/io/utilities/parsing_utils.cuh      | 21 +++++++++++++++++++++
 2 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 628c00ad603..4b8b016f7f5 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -381,10 +381,12 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
     col_size,
     [str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) {
       if (col.is_null(row)) { return; }
-      auto const in = str_tuples[row];
+      // If this is a string value, remove quotes
+      auto [in_begin, in_end] = trim_quotes(
+        str_tuples[row].first, str_tuples[row].first + str_tuples[row].second, options.quotechar);
 
-      auto const is_null_literal =
-        serialized_trie_contains(options.trie_na, {in.first, static_cast<size_t>(in.second)});
+      auto const is_null_literal = serialized_trie_contains(
+        options.trie_na, {in_begin, static_cast<size_t>(in_end - in_begin)});
 
       if (is_null_literal) {
         col.set_null(row);
@@ -393,8 +395,8 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
 
       auto const is_parsed = cudf::type_dispatcher(col_type,
                                                    ConvertFunctor{},
-                                                   in.first,
-                                                   in.first + in.second,
+                                                   in_begin,
+                                                   in_end,
                                                    col.data<char>(),
                                                    row,
                                                    col_type,
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 388c9b28001..7964f43b783 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -523,6 +523,27 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quote
   return {skip_character(trim_begin, quotechar), skip_character(trim_end, quotechar).base()};
 }
 
+/**
+ * @brief Adjusts the range to ignore starting/trailing quotation characters.
+ *
+ * @param[in] begin Pointer to the first character in the parsing range
+ * @param[in] end pointer to the first character after the parsing range
+ * @param[in] quotechar The character used to denote quotes; '\0' if none
+ *
+ * @return Trimmed range
+ */
+__inline__ __device__ std::pair<char const*, char const*> trim_quotes(char const* begin,
+                                                                      char const* end,
+                                                                      char quotechar = '\0')
+{
+  if ((thrust::distance(begin, end) >= 2 && *begin == quotechar &&
+       *thrust::prev(end) == quotechar)) {
+    thrust::advance(begin, 1);
+    thrust::advance(end, -1);
+  }
+  return {begin, end};
+}
+
 /**
  * @brief Decodes a numeric value base on templated cudf type T with specified
  * base.

From 1d69a69a863bbb26da274b04005a07d132ac34df Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 30 Sep 2022 17:18:09 +0530
Subject: [PATCH 02/39] add pytest compare cudf, cudf_experimental on values
 with quotes

---
 python/cudf/cudf/tests/test_json.py | 52 +++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 1fdef44546a..8e3c6fc6bda 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -669,6 +669,58 @@ def test_json_types_data():
     assert df.to_arrow().equals(pa_table_pdf)
 
 
+@pytest.mark.parametrize(
+    "col_type,json_str",
+    [
+        # without quotes
+        ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("int", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("int", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # without quotes, null
+        ("int", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'),
+        # without quotes
+        ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("float", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # with quotes, NAN
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]'),
+        # without quotes
+        ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("str", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("str", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # without quotes, null
+        ("str", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'),
+    ],
+)
+def test_json_quoted_values_with_schema(col_type, json_str):
+    experimental_df = cudf.read_json(
+        StringIO(json_str),
+        engine="cudf_experimental",
+        orient="records",
+        dtype={"k": col_type},
+    )
+    cudf_df = cudf.read_json(
+        StringIO(json_str.replace(",", "\n")[1:-1]),
+        engine="cudf",
+        orient="records",
+        lines=True,
+        dtype={"k": col_type},
+    )
+    assert_eq(cudf_df, experimental_df)
+
+
 @pytest.mark.parametrize(
     "keep_quotes,result",
     [

From b680e3f4cd652310cb012eb6afa9d15503854801 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 30 Sep 2022 19:03:37 +0530
Subject: [PATCH 03/39] remove unnecessary quote removal in timestamp and
 duration decode_value

---
 cpp/src/io/utilities/parsing_utils.cuh | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 7964f43b783..a075fa18531 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -584,22 +584,12 @@ __inline__ __device__ T decode_value(char const* begin,
                                      char const* end,
                                      parse_options_view const& opts)
 {
-  // If this is a string value, remove quotes
-  if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
-    thrust::advance(begin, 1);
-    thrust::advance(end, -1);
-  }
   return to_timestamp<T>(begin, end, opts.dayfirst);
 }
 
 template <typename T, CUDF_ENABLE_IF(cudf::is_duration<T>())>
 __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&)
 {
-  // If this is a string value, remove quotes
-  if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
-    thrust::advance(begin, 1);
-    thrust::advance(end, -1);
-  }
   return to_duration<T>(begin, end);
 }
 

From 46663cfc79107b97351718a088c24365de066a54 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 22 Oct 2022 03:51:20 +0530
Subject: [PATCH 04/39] null check before removing quotes

---
 cpp/include/cudf/io/detail/data_casting.cuh | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh
index 4b8b016f7f5..aba9ec07bc6 100644
--- a/cpp/include/cudf/io/detail/data_casting.cuh
+++ b/cpp/include/cudf/io/detail/data_casting.cuh
@@ -381,18 +381,19 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
     col_size,
     [str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) {
       if (col.is_null(row)) { return; }
-      // If this is a string value, remove quotes
-      auto [in_begin, in_end] = trim_quotes(
-        str_tuples[row].first, str_tuples[row].first + str_tuples[row].second, options.quotechar);
+      auto const in = str_tuples[row];
 
-      auto const is_null_literal = serialized_trie_contains(
-        options.trie_na, {in_begin, static_cast<size_t>(in_end - in_begin)});
+      auto const is_null_literal =
+        serialized_trie_contains(options.trie_na, {in.first, static_cast<size_t>(in.second)});
 
       if (is_null_literal) {
         col.set_null(row);
         return;
       }
 
+      // If this is a string value, remove quotes
+      auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar);
+
       auto const is_parsed = cudf::type_dispatcher(col_type,
                                                    ConvertFunctor{},
                                                    in_begin,

From d2b76e356073d3e9e781d6166e23f0fd76e1debb Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 22 Oct 2022 03:52:20 +0530
Subject: [PATCH 05/39] return std::optional from parse_numeric, for validating
 parsing

---
 cpp/src/io/utilities/parsing_utils.cuh | 115 +++++++++----------------
 cpp/src/strings/json/json_path.cu      |   7 +-
 2 files changed, 44 insertions(+), 78 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index a075fa18531..b37ab9d2b55 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -127,12 +127,14 @@ struct parse_options {
  *
  * @return uint8_t Numeric value of the character, or `0`
  */
-template <typename T, CUDF_ENABLE_IF(std::is_integral_v<T>)>
+template <typename T, bool as_hex = false, CUDF_ENABLE_IF(std::is_integral_v<T>)>
 constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
-  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
-  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  if constexpr (as_hex) {
+    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  }
 
   *valid_flag = false;
   return 0;
@@ -148,7 +150,7 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag)
  *
  * @return uint8_t Numeric value of the character, or `0`
  */
-template <typename T, CUDF_ENABLE_IF(!std::is_integral_v<T>)>
+template <typename T, bool as_hex = false, CUDF_ENABLE_IF(!std::is_integral_v<T>)>
 constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
@@ -194,13 +196,13 @@ constexpr bool is_infinity(char const* begin, char const* end)
  * @return The parsed and converted value
  */
 template <typename T, int base = 10>
-constexpr T parse_numeric(char const* begin,
-                          char const* end,
-                          parse_options_view const& opts,
-                          T error_result = std::numeric_limits<T>::quiet_NaN())
+__host__ __device__ std::optional<T> parse_numeric(char const* begin,
+                                                   char const* end,
+                                                   parse_options_view const& opts)
 {
   T value{};
   bool all_digits_valid = true;
+  constexpr bool as_hex = (base == 16);
 
   // Handle negative values if necessary
   int32_t sign = (*begin == '-') ? -1 : 1;
@@ -223,7 +225,7 @@ constexpr T parse_numeric(char const* begin,
     } else if (base == 10 && (*begin == 'e' || *begin == 'E')) {
       break;
     } else if (*begin != opts.thousands && *begin != '+') {
-      value = (value * base) + decode_digit<T>(*begin, &all_digits_valid);
+      value = (value * base) + decode_digit<T, as_hex>(*begin, &all_digits_valid);
     }
     ++begin;
   }
@@ -237,7 +239,7 @@ constexpr T parse_numeric(char const* begin,
         break;
       } else if (*begin != opts.thousands && *begin != '+') {
         divisor /= base;
-        value += decode_digit<T>(*begin, &all_digits_valid) * divisor;
+        value += decode_digit<T, as_hex>(*begin, &all_digits_valid) * divisor;
       }
       ++begin;
     }
@@ -248,12 +250,12 @@ constexpr T parse_numeric(char const* begin,
       if (*begin == '-' || *begin == '+') { ++begin; }
       int32_t exponent = 0;
       while (begin < end) {
-        exponent = (exponent * 10) + decode_digit<T>(*(begin++), &all_digits_valid);
+        exponent = (exponent * 10) + decode_digit<T, as_hex>(*(begin++), &all_digits_valid);
       }
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return error_result; }
+  if (!all_digits_valid) { return {}; }
 
   return value * sign;
 }
@@ -544,55 +546,6 @@ __inline__ __device__ std::pair<char const*, char const*> trim_quotes(char const
   return {begin, end};
 }
 
-/**
- * @brief Decodes a numeric value base on templated cudf type T with specified
- * base.
- *
- * @param[in] begin Beginning of the character string
- * @param[in] end End of the character string
- * @param opts The global parsing behavior options
- *
- * @return The parsed numeric value
- */
-template <typename T, int base>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
-{
-  return cudf::io::parse_numeric<T, base>(begin, end, opts);
-}
-
-/**
- * @brief Decodes a numeric value base on templated cudf type T
- *
- * @param[in] begin Beginning of the character string
- * @param[in] end End of the character string
- * @param opts The global parsing behavior options
- *
- * @return The parsed numeric value
- */
-template <typename T, CUDF_ENABLE_IF(!cudf::is_timestamp<T>() and !cudf::is_duration<T>())>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
-{
-  return cudf::io::parse_numeric<T>(begin, end, opts);
-}
-
-template <typename T, CUDF_ENABLE_IF(cudf::is_timestamp<T>())>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
-{
-  return to_timestamp<T>(begin, end, opts.dayfirst);
-}
-
-template <typename T, CUDF_ENABLE_IF(cudf::is_duration<T>())>
-__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&)
-{
-  return to_duration<T>(begin, end);
-}
-
 struct ConvertFunctor {
   /**
    * @brief Dispatch for numeric types whose values can be convertible to
@@ -612,15 +565,17 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex = false)
   {
-    static_cast<T*>(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T {
+    auto const value = [as_hex, &opts, begin, end]() -> std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
       if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; }
-      return as_hex ? decode_value<T, 16>(begin, end, opts) : decode_value<T>(begin, end, opts);
+      return as_hex ? cudf::io::parse_numeric<T, 16>(begin, end, opts)
+                    : cudf::io::parse_numeric<T>(begin, end, opts);
     }();
+    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
-    return true;
+    return value.has_value();
   }
 
   /**
@@ -637,13 +592,14 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
+    bool all_digits_valid = true;
     static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
-      [&opts, output_type, begin, end]() -> device_storage_type_t<T> {
+      [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t<T> {
       return strings::detail::parse_decimal<device_storage_type_t<T>>(
         begin, end, output_type.scale());
     }();
 
-    return true;
+    return all_digits_valid;
   }
 
   /**
@@ -658,15 +614,16 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    static_cast<T*>(out_buffer)[row] = [&opts, begin, end]() {
+    auto const value = [&opts, begin, end]() -> std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
       if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
-      return decode_value<T>(begin, end, opts);
+      return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
+    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
-    return true;
+    return value.has_value();
   }
 
   /**
@@ -682,10 +639,16 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    T const value                    = decode_value<T>(begin, end, opts);
-    static_cast<T*>(out_buffer)[row] = value;
+    auto const value = [&opts, begin, end]() -> std::optional<T> {
+      // Check for user-specified true/false values
+      auto const field_len = static_cast<size_t>(end - begin);
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
+      return cudf::io::parse_numeric<T>(begin, end, opts);
+    }();
+    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
 
-    return !std::isnan(value);
+    return value.has_value() and !std::isnan(*value);
   }
 
   /**
@@ -702,12 +665,14 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    if constexpr (cudf::is_timestamp<T>() or cudf::is_duration<T>()) {
-      static_cast<T*>(out_buffer)[row] = decode_value<T>(begin, end, opts);
-      return true;
+    if constexpr (cudf::is_timestamp<T>()) {
+      static_cast<T*>(out_buffer)[row] = to_timestamp<T>(begin, end, opts.dayfirst);
+    } else if constexpr (cudf::is_duration<T>()) {
+      static_cast<T*>(out_buffer)[row] = to_duration<T>(begin, end);
     } else {
       return false;
     }
+    return true;
   }
 };
 
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 9ec1ec248e5..528c62e7875 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -570,9 +570,10 @@ class path_state : private parser {
               op.type          = path_operator_type::CHILD;
               op.expected_type = OBJECT;
             } else {
-              op.type  = path_operator_type::CHILD_INDEX;
-              op.index = cudf::io::parse_numeric<int>(
-                op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1);
+              op.type          = path_operator_type::CHILD_INDEX;
+              auto const value = cudf::io::parse_numeric<int>(
+                op.name.data(), op.name.data() + op.name.size_bytes(), json_opts);
+              op.index = value.value_or(-1);
               CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath");
               op.expected_type = ARRAY;
             }

From c953410bc8f29becce9f594c2bcdc708f4611fa1 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 22 Oct 2022 03:53:21 +0530
Subject: [PATCH 06/39] add unit tests of corner cases

---
 cpp/tests/io/json_test.cpp | 90 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index b8cd4622484..a6b477e2c42 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -18,6 +18,7 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
@@ -29,6 +30,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <limits>
 #include <thrust/iterator/constant_iterator.h>
 
 #include <arrow/io/api.h>
@@ -1451,4 +1453,92 @@ TEST_F(JsonReaderTest, JsonNestedDtypeSchema)
                                  int_wrapper{{1, 1, 2}, {true, true, true}});
 }
 
+TEST_P(JsonReaderParamTest, JsonDtypeParsing)
+{
+  auto const test_opt          = GetParam();
+  bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient);
+  // All corner cases of dtype parsing
+  //  0, "0", " 0", 1, "1", " 1", "a", "z", null, true, false,  "null", "true", "false", nan, "nan"
+  // Test for dtypes: bool, int, float, str, duration, timestamp
+  std::string row_orient =
+    "[0]\n[\"0\"]\n[\" 0\"]\n[1]\n[\"1\"]\n[\" 1\"]\n[\"a\"]\n[\"z\"]\n"
+    "[null]\n[true]\n[false]\n[\"null\"]\n[\"true\"]\n[\"false\"]\n[nan]\n[\"nan\"]\n";
+  std::string record_orient = to_records_orient({{{"0", "0"}},
+                                                 {{"0", "\"0\""}},
+                                                 {{"0", "\" 0\""}},
+                                                 {{"0", "1"}},
+                                                 {{"0", "\"1\""}},
+                                                 {{"0", "\" 1\""}},
+                                                 {{"0", "\"a\""}},
+                                                 {{"0", "\"z\""}},
+                                                 {{"0", "null"}},
+                                                 {{"0", "true"}},
+                                                 {{"0", "false"}},
+                                                 {{"0", "\"null\""}},
+                                                 {{"0", "\"true\""}},
+                                                 {{"0", "\"false\""}},
+                                                 {{"0", "nan"}},
+                                                 {{"0", "\"nan\""}}},
+                                                "\n");
+
+  std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient;
+
+  auto make_validity = [](std::vector<int> const& validity) {
+    return cudf::detail::make_counting_transform_iterator(
+      0, [=](auto i) -> bool { return static_cast<bool>(validity[i]); });
+  };
+
+  constexpr int iNA    = -1;
+  constexpr double dNA = std::numeric_limits<double>::quiet_NaN();
+  constexpr bool bNA   = false;
+
+  std::vector<int> const validity_int   = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
+  std::vector<int> const validity_float = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
+  std::vector<int> const validity_bool  = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
+
+  auto int_col = int_wrapper{{0, 0, iNA, 1, 1, iNA, iNA, iNA, iNA, 1, 0, iNA, 1, 0, iNA, iNA},
+                             make_validity(validity_int)};
+  auto float_col =
+    float_wrapper{{0.0, 0.0, dNA, 1.0, 1.0, dNA, dNA, dNA, dNA, 1.0, 0.0, dNA, 1.0, 0.0, dNA, dNA},
+                  make_validity(validity_float)};
+  auto str_col =
+    cudf::test::strings_column_wrapper{// clang-format off
+    {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"},
+     cudf::test::iterators::nulls_at(std::vector<int>{8, 11})};
+  // clang-format on
+  auto bool_col = bool_wrapper{
+    {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA},
+    make_validity(validity_bool)};
+
+  // Types to test
+  const std::vector<data_type> dtypes = {
+    dtype<int32_t>(), dtype<float>(), dtype<cudf::string_view>(), dtype<bool>()};
+  const std::vector<cudf::column_view> cols{cudf::column_view(int_col),
+                                            cudf::column_view(float_col),
+                                            cudf::column_view(str_col),
+                                            cudf::column_view(bool_col)};
+  for (size_t col_type = 0; col_type < cols.size(); col_type++) {
+    std::map<std::string, cudf::io::schema_element> dtype_schema{{"0", {dtypes[col_type]}}};
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+        .dtypes(dtype_schema)
+        .lines(true)
+        .experimental(test_experimental);
+
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+    EXPECT_EQ(result.tbl->num_columns(), 1);
+    EXPECT_EQ(result.tbl->num_rows(), 16);
+
+    EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id());
+
+    EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+
+    std::cout << "Column: " << col_type << std::endl;
+    cudf::test::print(result.tbl->get_column(0));
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]);
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 1646bcc4e4462d123c5e461efac35be865cd9c17 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 22 Oct 2022 05:01:25 +0530
Subject: [PATCH 07/39] fix old json reader to treat "null" as string.

---
 cpp/src/io/json/json_gpu.cu | 5 ++++-
 cpp/tests/io/json_test.cpp  | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index dbfcca7d37a..e4fb7e2edf5 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -255,13 +255,16 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts,
     auto const desc =
       next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map);
     auto const value_len = static_cast<size_t>(std::max(desc.value_end - desc.value_begin, 0L));
+    const bool is_quoted =
+      *(desc.value_begin - 1) == opts.quotechar && *desc.value_end == opts.quotechar;
 
     current = desc.value_end + 1;
 
     using string_index_pair = thrust::pair<const char*, size_type>;
 
     // Empty fields are not legal values
-    if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) {
+    if (!serialized_trie_contains(opts.trie_na,
+                                  {desc.value_begin - is_quoted, value_len + is_quoted * 2})) {
       // Type dispatcher does not handle strings
       if (column_types[desc.column].id() == type_id::STRING) {
         auto str_list           = static_cast<string_index_pair*>(output_columns[desc.column]);
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index a6b477e2c42..c7cd8bef9e3 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1504,7 +1504,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
   auto str_col =
     cudf::test::strings_column_wrapper{// clang-format off
     {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"},
-     cudf::test::iterators::nulls_at(std::vector<int>{8, 11})};
+     cudf::test::iterators::nulls_at(std::vector<int>{8})};
   // clang-format on
   auto bool_col = bool_wrapper{
     {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA},

From f0c468bd071e878626bfbb069e67eb08b0bf9995 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 22 Oct 2022 05:01:55 +0530
Subject: [PATCH 08/39] cleanup decode_digit

---
 cpp/src/io/utilities/parsing_utils.cuh | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index b37ab9d2b55..daf73118fc1 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -118,7 +118,8 @@ struct parse_options {
 
 /**
  * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
- * for integral types. Handles hexadecimal digits, both uppercase and lowercase.
+ * for integral types. Handles hexadecimal digits, both uppercase and lowercase
+ * for integral types and only decimal digits for floating point types.
  * If the character is not a valid numeric digit then `0` is returned and
  * valid_flag is set to false.
  *
@@ -127,11 +128,11 @@ struct parse_options {
  *
  * @return uint8_t Numeric value of the character, or `0`
  */
-template <typename T, bool as_hex = false, CUDF_ENABLE_IF(std::is_integral_v<T>)>
+template <typename T, bool as_hex = false>
 constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
-  if constexpr (as_hex) {
+  if constexpr (as_hex and std::is_integral_v<T>) {
     if (c >= 'a' && c <= 'f') return c - 'a' + 10;
     if (c >= 'A' && c <= 'F') return c - 'A' + 10;
   }
@@ -140,25 +141,6 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag)
   return 0;
 }
 
-/**
- * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
- * for non-integral types. Handles only decimal digits. If the character is not
- * a valid numeric digit then `0` is returned and valid_flag is set to false.
- *
- * @param c ASCII or UTF-8 character
- * @param valid_flag Set to false if input is not valid. Unchanged otherwise.
- *
- * @return uint8_t Numeric value of the character, or `0`
- */
-template <typename T, bool as_hex = false, CUDF_ENABLE_IF(!std::is_integral_v<T>)>
-constexpr uint8_t decode_digit(char c, bool* valid_flag)
-{
-  if (c >= '0' && c <= '9') return c - '0';
-
-  *valid_flag = false;
-  return 0;
-}
-
 // Converts character to lowercase.
 constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; }
 

From cad413cf138caa36cfb4cafbf6fcb3f61f7faa72 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 22 Oct 2022 05:11:19 +0530
Subject: [PATCH 09/39] fix doc

---
 cpp/src/io/utilities/parsing_utils.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index daf73118fc1..dabb39f5e3a 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -117,8 +117,8 @@ struct parse_options {
 };
 
 /**
- * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
- * for integral types. Handles hexadecimal digits, both uppercase and lowercase
+ * @brief Returns the numeric value of an ASCII/UTF-8 character.
+ * Handles hexadecimal digits, both uppercase and lowercase
  * for integral types and only decimal digits for floating point types.
  * If the character is not a valid numeric digit then `0` is returned and
  * valid_flag is set to false.

From bbb434a619b3170b0f38ddbfa1e0b697f5e1fa28 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 22 Oct 2022 05:13:53 +0530
Subject: [PATCH 10/39] add TODO invalid input for decimal, duration, timestamp

---
 cpp/src/io/utilities/parsing_utils.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index dabb39f5e3a..414375147f2 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -574,6 +574,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
+    // TODO what's invalid input
     bool all_digits_valid = true;
     static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
       [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t<T> {
@@ -647,6 +648,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
+    // TODO what's invalid input
     if constexpr (cudf::is_timestamp<T>()) {
       static_cast<T*>(out_buffer)[row] = to_timestamp<T>(begin, end, opts.dayfirst);
     } else if constexpr (cudf::is_duration<T>()) {

From 0910f2e7b17db2bfba7c821481660507474998df Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 26 Oct 2022 20:43:45 +0530
Subject: [PATCH 11/39] fix bug in csv_reader_options construction in cython

---
 python/cudf/cudf/_lib/csv.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index f1a75baa951..52ef8d6c92d 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -304,7 +304,7 @@ cdef csv_reader_options make_csv_reader_options(
 
     if false_values is not None:
         c_false_values.reserve(len(false_values))
-        for fv in c_false_values:
+        for fv in false_values:
             c_false_values.push_back(fv.encode())
         csv_reader_options_c.set_false_values(c_false_values)
 

From 31636ea0a80e55b71465fc3d160aad7744fbdabe Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 26 Oct 2022 20:55:41 +0530
Subject: [PATCH 12/39] add pytest for breaking behaviour in read_csv

---
 python/cudf/cudf/tests/test_csv.py | 37 ++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index b91893d8991..580b5a2b2df 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -766,6 +766,43 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses):
     assert_eq(df_out, out)
 
 
+def test_csv_reader_bools_NA():
+    names = ["text", "int"]
+    dtypes = ["str", "int"]
+    trues = ["foo"]
+    falses = ["bar"]
+    lines = [
+        ",".join(names),
+        "true,true",
+        "false,false",
+        "foo,foo",
+        "bar,bar",
+        "qux,qux",
+    ]
+
+    buffer = "\n".join(lines)
+
+    df = read_csv(
+        StringIO(buffer),
+        names=names,
+        dtype=dtypes,
+        skiprows=1,
+        true_values=trues,
+        false_values=falses,
+    )
+    assert len(df.columns) == 2
+    assert df["text"].dtype == np.dtype("object")
+    assert df["int"].dtype == np.dtype("int64")
+    expected = pd.DataFrame(
+        {
+            "text": ["true", "false", "foo", "bar", "qux"],
+            "int": [1.0, 0.0, 1.0, 0.0, np.nan],
+        }
+    )
+    # breaking behaviour is np.nan for qux
+    assert_eq(df, expected)
+
+
 def test_csv_quotednumbers(tmpdir):
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv")
 

From d2757dc20343f25035da4c91653afc0ea8c27719 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 26 Oct 2022 20:43:45 +0530
Subject: [PATCH 13/39] fix bug in csv_reader_options construction in cython

---
 python/cudf/cudf/_lib/csv.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index f1a75baa951..52ef8d6c92d 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -304,7 +304,7 @@ cdef csv_reader_options make_csv_reader_options(
 
     if false_values is not None:
         c_false_values.reserve(len(false_values))
-        for fv in c_false_values:
+        for fv in false_values:
             c_false_values.push_back(fv.encode())
         csv_reader_options_c.set_false_values(c_false_values)
 

From e26b3d8ebbb554d1fe8b02aa4041e7d7f9699b15 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 26 Oct 2022 20:55:41 +0530
Subject: [PATCH 14/39] add pytest for breaking behaviour in read_csv

---
 python/cudf/cudf/tests/test_csv.py | 37 ++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index b91893d8991..580b5a2b2df 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -766,6 +766,43 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses):
     assert_eq(df_out, out)
 
 
+def test_csv_reader_bools_NA():
+    names = ["text", "int"]
+    dtypes = ["str", "int"]
+    trues = ["foo"]
+    falses = ["bar"]
+    lines = [
+        ",".join(names),
+        "true,true",
+        "false,false",
+        "foo,foo",
+        "bar,bar",
+        "qux,qux",
+    ]
+
+    buffer = "\n".join(lines)
+
+    df = read_csv(
+        StringIO(buffer),
+        names=names,
+        dtype=dtypes,
+        skiprows=1,
+        true_values=trues,
+        false_values=falses,
+    )
+    assert len(df.columns) == 2
+    assert df["text"].dtype == np.dtype("object")
+    assert df["int"].dtype == np.dtype("int64")
+    expected = pd.DataFrame(
+        {
+            "text": ["true", "false", "foo", "bar", "qux"],
+            "int": [1.0, 0.0, 1.0, 0.0, np.nan],
+        }
+    )
+    # breaking behaviour is np.nan for qux
+    assert_eq(df, expected)
+
+
 def test_csv_quotednumbers(tmpdir):
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv")
 

From e119facfadf2481f40302e05981ec7e9b4489113 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 15:47:00 +0530
Subject: [PATCH 15/39] parsing error in numeric types should make that element
 null

---
 cpp/src/io/utilities/parsing_utils.cuh | 150 +++++++++----------------
 cpp/src/strings/json/json_path.cu      |   7 +-
 cpp/tests/io/json_test.cpp             |  90 +++++++++++++++
 python/cudf/cudf/tests/test_json.py    |  52 +++++++++
 4 files changed, 201 insertions(+), 98 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 388c9b28001..414375147f2 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -117,8 +117,9 @@ struct parse_options {
 };
 
 /**
- * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
- * for integral types. Handles hexadecimal digits, both uppercase and lowercase.
+ * @brief Returns the numeric value of an ASCII/UTF-8 character.
+ * Handles hexadecimal digits, both uppercase and lowercase
+ * for integral types and only decimal digits for floating point types.
  * If the character is not a valid numeric digit then `0` is returned and
  * valid_flag is set to false.
  *
@@ -127,31 +128,14 @@ struct parse_options {
  *
  * @return uint8_t Numeric value of the character, or `0`
  */
-template <typename T, CUDF_ENABLE_IF(std::is_integral_v<T>)>
-constexpr uint8_t decode_digit(char c, bool* valid_flag)
-{
-  if (c >= '0' && c <= '9') return c - '0';
-  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
-  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
-
-  *valid_flag = false;
-  return 0;
-}
-
-/**
- * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
- * for non-integral types. Handles only decimal digits. If the character is not
- * a valid numeric digit then `0` is returned and valid_flag is set to false.
- *
- * @param c ASCII or UTF-8 character
- * @param valid_flag Set to false if input is not valid. Unchanged otherwise.
- *
- * @return uint8_t Numeric value of the character, or `0`
- */
-template <typename T, CUDF_ENABLE_IF(!std::is_integral_v<T>)>
+template <typename T, bool as_hex = false>
 constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
+  if constexpr (as_hex and std::is_integral_v<T>) {
+    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  }
 
   *valid_flag = false;
   return 0;
@@ -194,13 +178,13 @@ constexpr bool is_infinity(char const* begin, char const* end)
  * @return The parsed and converted value
  */
 template <typename T, int base = 10>
-constexpr T parse_numeric(char const* begin,
-                          char const* end,
-                          parse_options_view const& opts,
-                          T error_result = std::numeric_limits<T>::quiet_NaN())
+__host__ __device__ std::optional<T> parse_numeric(char const* begin,
+                                                   char const* end,
+                                                   parse_options_view const& opts)
 {
   T value{};
   bool all_digits_valid = true;
+  constexpr bool as_hex = (base == 16);
 
   // Handle negative values if necessary
   int32_t sign = (*begin == '-') ? -1 : 1;
@@ -223,7 +207,7 @@ constexpr T parse_numeric(char const* begin,
     } else if (base == 10 && (*begin == 'e' || *begin == 'E')) {
       break;
     } else if (*begin != opts.thousands && *begin != '+') {
-      value = (value * base) + decode_digit<T>(*begin, &all_digits_valid);
+      value = (value * base) + decode_digit<T, as_hex>(*begin, &all_digits_valid);
     }
     ++begin;
   }
@@ -237,7 +221,7 @@ constexpr T parse_numeric(char const* begin,
         break;
       } else if (*begin != opts.thousands && *begin != '+') {
         divisor /= base;
-        value += decode_digit<T>(*begin, &all_digits_valid) * divisor;
+        value += decode_digit<T, as_hex>(*begin, &all_digits_valid) * divisor;
       }
       ++begin;
     }
@@ -248,12 +232,12 @@ constexpr T parse_numeric(char const* begin,
       if (*begin == '-' || *begin == '+') { ++begin; }
       int32_t exponent = 0;
       while (begin < end) {
-        exponent = (exponent * 10) + decode_digit<T>(*(begin++), &all_digits_valid);
+        exponent = (exponent * 10) + decode_digit<T, as_hex>(*(begin++), &all_digits_valid);
       }
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return error_result; }
+  if (!all_digits_valid) { return {}; }
 
   return value * sign;
 }
@@ -524,62 +508,24 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quote
 }
 
 /**
- * @brief Decodes a numeric value base on templated cudf type T with specified
- * base.
- *
- * @param[in] begin Beginning of the character string
- * @param[in] end End of the character string
- * @param opts The global parsing behavior options
- *
- * @return The parsed numeric value
- */
-template <typename T, int base>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
-{
-  return cudf::io::parse_numeric<T, base>(begin, end, opts);
-}
-
-/**
- * @brief Decodes a numeric value base on templated cudf type T
+ * @brief Adjusts the range to ignore starting/trailing quotation characters.
  *
- * @param[in] begin Beginning of the character string
- * @param[in] end End of the character string
- * @param opts The global parsing behavior options
+ * @param[in] begin Pointer to the first character in the parsing range
+ * @param[in] end pointer to the first character after the parsing range
+ * @param[in] quotechar The character used to denote quotes; '\0' if none
  *
- * @return The parsed numeric value
+ * @return Trimmed range
  */
-template <typename T, CUDF_ENABLE_IF(!cudf::is_timestamp<T>() and !cudf::is_duration<T>())>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
-{
-  return cudf::io::parse_numeric<T>(begin, end, opts);
-}
-
-template <typename T, CUDF_ENABLE_IF(cudf::is_timestamp<T>())>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
+__inline__ __device__ std::pair<char const*, char const*> trim_quotes(char const* begin,
+                                                                      char const* end,
+                                                                      char quotechar = '\0')
 {
-  // If this is a string value, remove quotes
-  if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
+  if ((thrust::distance(begin, end) >= 2 && *begin == quotechar &&
+       *thrust::prev(end) == quotechar)) {
     thrust::advance(begin, 1);
     thrust::advance(end, -1);
   }
-  return to_timestamp<T>(begin, end, opts.dayfirst);
-}
-
-template <typename T, CUDF_ENABLE_IF(cudf::is_duration<T>())>
-__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&)
-{
-  // If this is a string value, remove quotes
-  if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
-    thrust::advance(begin, 1);
-    thrust::advance(end, -1);
-  }
-  return to_duration<T>(begin, end);
+  return {begin, end};
 }
 
 struct ConvertFunctor {
@@ -601,15 +547,17 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex = false)
   {
-    static_cast<T*>(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T {
+    auto const value = [as_hex, &opts, begin, end]() -> std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
       if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; }
-      return as_hex ? decode_value<T, 16>(begin, end, opts) : decode_value<T>(begin, end, opts);
+      return as_hex ? cudf::io::parse_numeric<T, 16>(begin, end, opts)
+                    : cudf::io::parse_numeric<T>(begin, end, opts);
     }();
+    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
-    return true;
+    return value.has_value();
   }
 
   /**
@@ -626,13 +574,15 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
+    // TODO what's invalid input
+    bool all_digits_valid = true;
     static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
-      [&opts, output_type, begin, end]() -> device_storage_type_t<T> {
+      [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t<T> {
       return strings::detail::parse_decimal<device_storage_type_t<T>>(
         begin, end, output_type.scale());
     }();
 
-    return true;
+    return all_digits_valid;
   }
 
   /**
@@ -647,15 +597,16 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    static_cast<T*>(out_buffer)[row] = [&opts, begin, end]() {
+    auto const value = [&opts, begin, end]() -> std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
       if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
-      return decode_value<T>(begin, end, opts);
+      return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
+    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
-    return true;
+    return value.has_value();
   }
 
   /**
@@ -671,10 +622,16 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    T const value                    = decode_value<T>(begin, end, opts);
-    static_cast<T*>(out_buffer)[row] = value;
+    auto const value = [&opts, begin, end]() -> std::optional<T> {
+      // Check for user-specified true/false values
+      auto const field_len = static_cast<size_t>(end - begin);
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
+      return cudf::io::parse_numeric<T>(begin, end, opts);
+    }();
+    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
 
-    return !std::isnan(value);
+    return value.has_value() and !std::isnan(*value);
   }
 
   /**
@@ -691,12 +648,15 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    if constexpr (cudf::is_timestamp<T>() or cudf::is_duration<T>()) {
-      static_cast<T*>(out_buffer)[row] = decode_value<T>(begin, end, opts);
-      return true;
+    // TODO what's invalid input
+    if constexpr (cudf::is_timestamp<T>()) {
+      static_cast<T*>(out_buffer)[row] = to_timestamp<T>(begin, end, opts.dayfirst);
+    } else if constexpr (cudf::is_duration<T>()) {
+      static_cast<T*>(out_buffer)[row] = to_duration<T>(begin, end);
     } else {
       return false;
     }
+    return true;
   }
 };
 
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 303c35ea7fb..5ce00eeee4a 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -570,9 +570,10 @@ class path_state : private parser {
               op.type          = path_operator_type::CHILD;
               op.expected_type = OBJECT;
             } else {
-              op.type  = path_operator_type::CHILD_INDEX;
-              op.index = cudf::io::parse_numeric<int>(
-                op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1);
+              op.type          = path_operator_type::CHILD_INDEX;
+              auto const value = cudf::io::parse_numeric<int>(
+                op.name.data(), op.name.data() + op.name.size_bytes(), json_opts);
+              op.index = value.value_or(-1);
               CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath");
               op.expected_type = ARRAY;
             }
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index b8cd4622484..c7cd8bef9e3 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -18,6 +18,7 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
@@ -29,6 +30,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <limits>
 #include <thrust/iterator/constant_iterator.h>
 
 #include <arrow/io/api.h>
@@ -1451,4 +1453,92 @@ TEST_F(JsonReaderTest, JsonNestedDtypeSchema)
                                  int_wrapper{{1, 1, 2}, {true, true, true}});
 }
 
+TEST_P(JsonReaderParamTest, JsonDtypeParsing)
+{
+  auto const test_opt          = GetParam();
+  bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient);
+  // All corner cases of dtype parsing
+  //  0, "0", " 0", 1, "1", " 1", "a", "z", null, true, false,  "null", "true", "false", nan, "nan"
+  // Test for dtypes: bool, int, float, str, duration, timestamp
+  std::string row_orient =
+    "[0]\n[\"0\"]\n[\" 0\"]\n[1]\n[\"1\"]\n[\" 1\"]\n[\"a\"]\n[\"z\"]\n"
+    "[null]\n[true]\n[false]\n[\"null\"]\n[\"true\"]\n[\"false\"]\n[nan]\n[\"nan\"]\n";
+  std::string record_orient = to_records_orient({{{"0", "0"}},
+                                                 {{"0", "\"0\""}},
+                                                 {{"0", "\" 0\""}},
+                                                 {{"0", "1"}},
+                                                 {{"0", "\"1\""}},
+                                                 {{"0", "\" 1\""}},
+                                                 {{"0", "\"a\""}},
+                                                 {{"0", "\"z\""}},
+                                                 {{"0", "null"}},
+                                                 {{"0", "true"}},
+                                                 {{"0", "false"}},
+                                                 {{"0", "\"null\""}},
+                                                 {{"0", "\"true\""}},
+                                                 {{"0", "\"false\""}},
+                                                 {{"0", "nan"}},
+                                                 {{"0", "\"nan\""}}},
+                                                "\n");
+
+  std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient;
+
+  auto make_validity = [](std::vector<int> const& validity) {
+    return cudf::detail::make_counting_transform_iterator(
+      0, [=](auto i) -> bool { return static_cast<bool>(validity[i]); });
+  };
+
+  constexpr int iNA    = -1;
+  constexpr double dNA = std::numeric_limits<double>::quiet_NaN();
+  constexpr bool bNA   = false;
+
+  std::vector<int> const validity_int   = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
+  std::vector<int> const validity_float = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
+  std::vector<int> const validity_bool  = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
+
+  auto int_col = int_wrapper{{0, 0, iNA, 1, 1, iNA, iNA, iNA, iNA, 1, 0, iNA, 1, 0, iNA, iNA},
+                             make_validity(validity_int)};
+  auto float_col =
+    float_wrapper{{0.0, 0.0, dNA, 1.0, 1.0, dNA, dNA, dNA, dNA, 1.0, 0.0, dNA, 1.0, 0.0, dNA, dNA},
+                  make_validity(validity_float)};
+  auto str_col =
+    cudf::test::strings_column_wrapper{// clang-format off
+    {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"},
+     cudf::test::iterators::nulls_at(std::vector<int>{8})};
+  // clang-format on
+  auto bool_col = bool_wrapper{
+    {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA},
+    make_validity(validity_bool)};
+
+  // Types to test
+  const std::vector<data_type> dtypes = {
+    dtype<int32_t>(), dtype<float>(), dtype<cudf::string_view>(), dtype<bool>()};
+  const std::vector<cudf::column_view> cols{cudf::column_view(int_col),
+                                            cudf::column_view(float_col),
+                                            cudf::column_view(str_col),
+                                            cudf::column_view(bool_col)};
+  for (size_t col_type = 0; col_type < cols.size(); col_type++) {
+    std::map<std::string, cudf::io::schema_element> dtype_schema{{"0", {dtypes[col_type]}}};
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+        .dtypes(dtype_schema)
+        .lines(true)
+        .experimental(test_experimental);
+
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+    EXPECT_EQ(result.tbl->num_columns(), 1);
+    EXPECT_EQ(result.tbl->num_rows(), 16);
+
+    EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id());
+
+    EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+
+    std::cout << "Column: " << col_type << std::endl;
+    cudf::test::print(result.tbl->get_column(0));
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]);
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index fb2c24b3757..8d06d3e55f3 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -687,6 +687,58 @@ def test_json_types_data():
     assert df.to_arrow().equals(pa_table_pdf)
 
 
+@pytest.mark.parametrize(
+    "col_type,json_str",
+    [
+        # without quotes
+        ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("int", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("int", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # without quotes, null
+        ("int", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'),
+        # without quotes
+        ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("float", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # with quotes, NAN
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]'),
+        # without quotes
+        ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("str", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("str", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # without quotes, null
+        ("str", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'),
+    ],
+)
+def test_json_quoted_values_with_schema(col_type, json_str):
+    experimental_df = cudf.read_json(
+        StringIO(json_str),
+        engine="cudf_experimental",
+        orient="records",
+        dtype={"k": col_type},
+    )
+    cudf_df = cudf.read_json(
+        StringIO(json_str.replace(",", "\n")[1:-1]),
+        engine="cudf",
+        orient="records",
+        lines=True,
+        dtype={"k": col_type},
+    )
+    assert_eq(cudf_df, experimental_df)
+
+
 @pytest.mark.parametrize(
     "keep_quotes,result",
     [

From 3f8eb525aa7dd0ab292deb515ab580cb08f3d94e Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 16:42:33 +0530
Subject: [PATCH 16/39] revert parsing error changes

---
 cpp/src/io/utilities/parsing_utils.cuh | 155 +++++++++++++++++--------
 cpp/src/strings/json/json_path.cu      |   7 +-
 cpp/tests/io/json_test.cpp             |  90 --------------
 python/cudf/cudf/_lib/csv.pyx          |   2 +-
 python/cudf/cudf/tests/test_csv.py     |  37 ------
 python/cudf/cudf/tests/test_json.py    |  52 ---------
 6 files changed, 112 insertions(+), 231 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 414375147f2..354abfa3985 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -117,9 +117,8 @@ struct parse_options {
 };
 
 /**
- * @brief Returns the numeric value of an ASCII/UTF-8 character.
- * Handles hexadecimal digits, both uppercase and lowercase
- * for integral types and only decimal digits for floating point types.
+ * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
+ * for integral types. Handles hexadecimal digits, both uppercase and lowercase.
  * If the character is not a valid numeric digit then `0` is returned and
  * valid_flag is set to false.
  *
@@ -128,14 +127,31 @@ struct parse_options {
  *
  * @return uint8_t Numeric value of the character, or `0`
  */
-template <typename T, bool as_hex = false>
+template <typename T, CUDF_ENABLE_IF(std::is_integral_v<T>)>
+constexpr uint8_t decode_digit(char c, bool* valid_flag)
+{
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+
+  *valid_flag = false;
+  return 0;
+}
+
+/**
+ * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
+ * for non-integral types. Handles only decimal digits. If the character is not
+ * a valid numeric digit then `0` is returned and valid_flag is set to false.
+ *
+ * @param c ASCII or UTF-8 character
+ * @param valid_flag Set to false if input is not valid. Unchanged otherwise.
+ *
+ * @return uint8_t Numeric value of the character, or `0`
+ */
+template <typename T, CUDF_ENABLE_IF(!std::is_integral_v<T>)>
 constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
-  if constexpr (as_hex and std::is_integral_v<T>) {
-    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
-    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
-  }
 
   *valid_flag = false;
   return 0;
@@ -178,13 +194,13 @@ constexpr bool is_infinity(char const* begin, char const* end)
  * @return The parsed and converted value
  */
 template <typename T, int base = 10>
-__host__ __device__ std::optional<T> parse_numeric(char const* begin,
-                                                   char const* end,
-                                                   parse_options_view const& opts)
+constexpr T parse_numeric(char const* begin,
+                          char const* end,
+                          parse_options_view const& opts,
+                          T error_result = std::numeric_limits<T>::quiet_NaN())
 {
   T value{};
   bool all_digits_valid = true;
-  constexpr bool as_hex = (base == 16);
 
   // Handle negative values if necessary
   int32_t sign = (*begin == '-') ? -1 : 1;
@@ -207,7 +223,7 @@ __host__ __device__ std::optional<T> parse_numeric(char const* begin,
     } else if (base == 10 && (*begin == 'e' || *begin == 'E')) {
       break;
     } else if (*begin != opts.thousands && *begin != '+') {
-      value = (value * base) + decode_digit<T, as_hex>(*begin, &all_digits_valid);
+      value = (value * base) + decode_digit<T>(*begin, &all_digits_valid);
     }
     ++begin;
   }
@@ -221,7 +237,7 @@ __host__ __device__ std::optional<T> parse_numeric(char const* begin,
         break;
       } else if (*begin != opts.thousands && *begin != '+') {
         divisor /= base;
-        value += decode_digit<T, as_hex>(*begin, &all_digits_valid) * divisor;
+        value += decode_digit<T>(*begin, &all_digits_valid) * divisor;
       }
       ++begin;
     }
@@ -232,12 +248,12 @@ __host__ __device__ std::optional<T> parse_numeric(char const* begin,
       if (*begin == '-' || *begin == '+') { ++begin; }
       int32_t exponent = 0;
       while (begin < end) {
-        exponent = (exponent * 10) + decode_digit<T, as_hex>(*(begin++), &all_digits_valid);
+        exponent = (exponent * 10) + decode_digit<T>(*(begin++), &all_digits_valid);
       }
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return {}; }
+  if (!all_digits_valid) { return error_result; }
 
   return value * sign;
 }
@@ -510,9 +526,9 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quote
 /**
  * @brief Adjusts the range to ignore starting/trailing quotation characters.
  *
- * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end pointer to the first character after the parsing range
- * @param[in] quotechar The character used to denote quotes; '\0' if none
+ * @param begin Pointer to the first character in the parsing range
+ * @param end pointer to the first character after the parsing range
+ * @param quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range
  */
@@ -528,6 +544,65 @@ __inline__ __device__ std::pair<char const*, char const*> trim_quotes(char const
   return {begin, end};
 }
 
+/**
+ * @brief Decodes a numeric value base on templated cudf type T with specified
+ * base.
+ *
+ * @param[in] begin Beginning of the character string
+ * @param[in] end End of the character string
+ * @param opts The global parsing behavior options
+ *
+ * @return The parsed numeric value
+ */
+template <typename T, int base>
+__inline__ __device__ T decode_value(char const* begin,
+                                     char const* end,
+                                     parse_options_view const& opts)
+{
+  return cudf::io::parse_numeric<T, base>(begin, end, opts);
+}
+
+/**
+ * @brief Decodes a numeric value base on templated cudf type T
+ *
+ * @param[in] begin Beginning of the character string
+ * @param[in] end End of the character string
+ * @param opts The global parsing behavior options
+ *
+ * @return The parsed numeric value
+ */
+template <typename T, CUDF_ENABLE_IF(!cudf::is_timestamp<T>() and !cudf::is_duration<T>())>
+__inline__ __device__ T decode_value(char const* begin,
+                                     char const* end,
+                                     parse_options_view const& opts)
+{
+  return cudf::io::parse_numeric<T>(begin, end, opts);
+}
+
+template <typename T, CUDF_ENABLE_IF(cudf::is_timestamp<T>())>
+__inline__ __device__ T decode_value(char const* begin,
+                                     char const* end,
+                                     parse_options_view const& opts)
+{
+  // If this is a string value, remove quotes
+  if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
+    thrust::advance(begin, 1);
+    thrust::advance(end, -1);
+  }
+  return to_timestamp<T>(begin, end, opts.dayfirst);
+}
+
+template <typename T, CUDF_ENABLE_IF(cudf::is_duration<T>())>
+__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&)
+{
+  // If this is a string value, remove quotes
+  if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
+    thrust::advance(begin, 1);
+    thrust::advance(end, -1);
+  }
+  return to_duration<T>(begin, end);
+}
+
 struct ConvertFunctor {
   /**
    * @brief Dispatch for numeric types whose values can be convertible to
@@ -547,17 +622,15 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex = false)
   {
-    auto const value = [as_hex, &opts, begin, end]() -> std::optional<T> {
+    static_cast<T*>(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
       if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; }
-      return as_hex ? cudf::io::parse_numeric<T, 16>(begin, end, opts)
-                    : cudf::io::parse_numeric<T>(begin, end, opts);
+      return as_hex ? decode_value<T, 16>(begin, end, opts) : decode_value<T>(begin, end, opts);
     }();
-    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
-    return value.has_value();
+    return true;
   }
 
   /**
@@ -574,15 +647,13 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    // TODO what's invalid input
-    bool all_digits_valid = true;
     static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
-      [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t<T> {
+      [&opts, output_type, begin, end]() -> device_storage_type_t<T> {
       return strings::detail::parse_decimal<device_storage_type_t<T>>(
         begin, end, output_type.scale());
     }();
 
-    return all_digits_valid;
+    return true;
   }
 
   /**
@@ -597,16 +668,15 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    auto const value = [&opts, begin, end]() -> std::optional<T> {
+    static_cast<T*>(out_buffer)[row] = [&opts, begin, end]() {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
       if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
-      return cudf::io::parse_numeric<T>(begin, end, opts);
+      return decode_value<T>(begin, end, opts);
     }();
-    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
-    return value.has_value();
+    return true;
   }
 
   /**
@@ -622,16 +692,10 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    auto const value = [&opts, begin, end]() -> std::optional<T> {
-      // Check for user-specified true/false values
-      auto const field_len = static_cast<size_t>(end - begin);
-      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
-      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
-      return cudf::io::parse_numeric<T>(begin, end, opts);
-    }();
-    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
+    T const value                    = decode_value<T>(begin, end, opts);
+    static_cast<T*>(out_buffer)[row] = value;
 
-    return value.has_value() and !std::isnan(*value);
+    return !std::isnan(value);
   }
 
   /**
@@ -648,15 +712,12 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    // TODO what's invalid input
-    if constexpr (cudf::is_timestamp<T>()) {
-      static_cast<T*>(out_buffer)[row] = to_timestamp<T>(begin, end, opts.dayfirst);
-    } else if constexpr (cudf::is_duration<T>()) {
-      static_cast<T*>(out_buffer)[row] = to_duration<T>(begin, end);
+    if constexpr (cudf::is_timestamp<T>() or cudf::is_duration<T>()) {
+      static_cast<T*>(out_buffer)[row] = decode_value<T>(begin, end, opts);
+      return true;
     } else {
       return false;
     }
-    return true;
   }
 };
 
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 5ce00eeee4a..303c35ea7fb 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -570,10 +570,9 @@ class path_state : private parser {
               op.type          = path_operator_type::CHILD;
               op.expected_type = OBJECT;
             } else {
-              op.type          = path_operator_type::CHILD_INDEX;
-              auto const value = cudf::io::parse_numeric<int>(
-                op.name.data(), op.name.data() + op.name.size_bytes(), json_opts);
-              op.index = value.value_or(-1);
+              op.type  = path_operator_type::CHILD_INDEX;
+              op.index = cudf::io::parse_numeric<int>(
+                op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1);
               CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath");
               op.expected_type = ARRAY;
             }
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index c7cd8bef9e3..b8cd4622484 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -18,7 +18,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
@@ -30,7 +29,6 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <limits>
 #include <thrust/iterator/constant_iterator.h>
 
 #include <arrow/io/api.h>
@@ -1453,92 +1451,4 @@ TEST_F(JsonReaderTest, JsonNestedDtypeSchema)
                                  int_wrapper{{1, 1, 2}, {true, true, true}});
 }
 
-TEST_P(JsonReaderParamTest, JsonDtypeParsing)
-{
-  auto const test_opt          = GetParam();
-  bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient);
-  // All corner cases of dtype parsing
-  //  0, "0", " 0", 1, "1", " 1", "a", "z", null, true, false,  "null", "true", "false", nan, "nan"
-  // Test for dtypes: bool, int, float, str, duration, timestamp
-  std::string row_orient =
-    "[0]\n[\"0\"]\n[\" 0\"]\n[1]\n[\"1\"]\n[\" 1\"]\n[\"a\"]\n[\"z\"]\n"
-    "[null]\n[true]\n[false]\n[\"null\"]\n[\"true\"]\n[\"false\"]\n[nan]\n[\"nan\"]\n";
-  std::string record_orient = to_records_orient({{{"0", "0"}},
-                                                 {{"0", "\"0\""}},
-                                                 {{"0", "\" 0\""}},
-                                                 {{"0", "1"}},
-                                                 {{"0", "\"1\""}},
-                                                 {{"0", "\" 1\""}},
-                                                 {{"0", "\"a\""}},
-                                                 {{"0", "\"z\""}},
-                                                 {{"0", "null"}},
-                                                 {{"0", "true"}},
-                                                 {{"0", "false"}},
-                                                 {{"0", "\"null\""}},
-                                                 {{"0", "\"true\""}},
-                                                 {{"0", "\"false\""}},
-                                                 {{"0", "nan"}},
-                                                 {{"0", "\"nan\""}}},
-                                                "\n");
-
-  std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient;
-
-  auto make_validity = [](std::vector<int> const& validity) {
-    return cudf::detail::make_counting_transform_iterator(
-      0, [=](auto i) -> bool { return static_cast<bool>(validity[i]); });
-  };
-
-  constexpr int iNA    = -1;
-  constexpr double dNA = std::numeric_limits<double>::quiet_NaN();
-  constexpr bool bNA   = false;
-
-  std::vector<int> const validity_int   = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
-  std::vector<int> const validity_float = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
-  std::vector<int> const validity_bool  = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
-
-  auto int_col = int_wrapper{{0, 0, iNA, 1, 1, iNA, iNA, iNA, iNA, 1, 0, iNA, 1, 0, iNA, iNA},
-                             make_validity(validity_int)};
-  auto float_col =
-    float_wrapper{{0.0, 0.0, dNA, 1.0, 1.0, dNA, dNA, dNA, dNA, 1.0, 0.0, dNA, 1.0, 0.0, dNA, dNA},
-                  make_validity(validity_float)};
-  auto str_col =
-    cudf::test::strings_column_wrapper{// clang-format off
-    {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"},
-     cudf::test::iterators::nulls_at(std::vector<int>{8})};
-  // clang-format on
-  auto bool_col = bool_wrapper{
-    {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA},
-    make_validity(validity_bool)};
-
-  // Types to test
-  const std::vector<data_type> dtypes = {
-    dtype<int32_t>(), dtype<float>(), dtype<cudf::string_view>(), dtype<bool>()};
-  const std::vector<cudf::column_view> cols{cudf::column_view(int_col),
-                                            cudf::column_view(float_col),
-                                            cudf::column_view(str_col),
-                                            cudf::column_view(bool_col)};
-  for (size_t col_type = 0; col_type < cols.size(); col_type++) {
-    std::map<std::string, cudf::io::schema_element> dtype_schema{{"0", {dtypes[col_type]}}};
-    cudf::io::json_reader_options in_options =
-      cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
-        .dtypes(dtype_schema)
-        .lines(true)
-        .experimental(test_experimental);
-
-    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
-
-    EXPECT_EQ(result.tbl->num_columns(), 1);
-    EXPECT_EQ(result.tbl->num_rows(), 16);
-
-    EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id());
-
-    EXPECT_EQ(result.metadata.schema_info[0].name, "0");
-
-    std::cout << "Column: " << col_type << std::endl;
-    cudf::test::print(result.tbl->get_column(0));
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]);
-  }
-}
-
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 52ef8d6c92d..f1a75baa951 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -304,7 +304,7 @@ cdef csv_reader_options make_csv_reader_options(
 
     if false_values is not None:
         c_false_values.reserve(len(false_values))
-        for fv in false_values:
+        for fv in c_false_values:
             c_false_values.push_back(fv.encode())
         csv_reader_options_c.set_false_values(c_false_values)
 
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 580b5a2b2df..b91893d8991 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -766,43 +766,6 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses):
     assert_eq(df_out, out)
 
 
-def test_csv_reader_bools_NA():
-    names = ["text", "int"]
-    dtypes = ["str", "int"]
-    trues = ["foo"]
-    falses = ["bar"]
-    lines = [
-        ",".join(names),
-        "true,true",
-        "false,false",
-        "foo,foo",
-        "bar,bar",
-        "qux,qux",
-    ]
-
-    buffer = "\n".join(lines)
-
-    df = read_csv(
-        StringIO(buffer),
-        names=names,
-        dtype=dtypes,
-        skiprows=1,
-        true_values=trues,
-        false_values=falses,
-    )
-    assert len(df.columns) == 2
-    assert df["text"].dtype == np.dtype("object")
-    assert df["int"].dtype == np.dtype("int64")
-    expected = pd.DataFrame(
-        {
-            "text": ["true", "false", "foo", "bar", "qux"],
-            "int": [1.0, 0.0, 1.0, 0.0, np.nan],
-        }
-    )
-    # breaking behaviour is np.nan for qux
-    assert_eq(df, expected)
-
-
 def test_csv_quotednumbers(tmpdir):
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv")
 
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 8d06d3e55f3..fb2c24b3757 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -687,58 +687,6 @@ def test_json_types_data():
     assert df.to_arrow().equals(pa_table_pdf)
 
 
-@pytest.mark.parametrize(
-    "col_type,json_str",
-    [
-        # without quotes
-        ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
-        # with quotes
-        ("int", '[{"k": "1"}, {"k": "2"}]'),
-        # with quotes, mixed
-        ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
-        # with quotes, null, mixed
-        ("int", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
-        # without quotes, null
-        ("int", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'),
-        # without quotes
-        ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
-        # with quotes
-        ("float", '[{"k": "1"}, {"k": "2"}]'),
-        # with quotes, mixed
-        ("float", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
-        # with quotes, null, mixed
-        ("float", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
-        # with quotes, NAN
-        ("float", '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]'),
-        # without quotes
-        ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
-        # with quotes
-        ("str", '[{"k": "1"}, {"k": "2"}]'),
-        # with quotes, mixed
-        ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
-        # with quotes, null, mixed
-        ("str", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
-        # without quotes, null
-        ("str", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'),
-    ],
-)
-def test_json_quoted_values_with_schema(col_type, json_str):
-    experimental_df = cudf.read_json(
-        StringIO(json_str),
-        engine="cudf_experimental",
-        orient="records",
-        dtype={"k": col_type},
-    )
-    cudf_df = cudf.read_json(
-        StringIO(json_str.replace(",", "\n")[1:-1]),
-        engine="cudf",
-        orient="records",
-        lines=True,
-        dtype={"k": col_type},
-    )
-    assert_eq(cudf_df, experimental_df)
-
-
 @pytest.mark.parametrize(
     "keep_quotes,result",
     [

From e0f6e3d28e4a5ff7f41d9991ca3180484f24e091 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 17:04:15 +0530
Subject: [PATCH 17/39] unit tests for string quotes

---
 python/cudf/cudf/tests/test_json.py | 52 +++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index fb2c24b3757..8d06d3e55f3 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -687,6 +687,58 @@ def test_json_types_data():
     assert df.to_arrow().equals(pa_table_pdf)
 
 
+@pytest.mark.parametrize(
+    "col_type,json_str",
+    [
+        # without quotes
+        ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("int", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("int", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # without quotes, null
+        ("int", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'),
+        # without quotes
+        ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("float", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # with quotes, NAN
+        ("float", '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]'),
+        # without quotes
+        ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'),
+        # with quotes
+        ("str", '[{"k": "1"}, {"k": "2"}]'),
+        # with quotes, mixed
+        ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'),
+        # with quotes, null, mixed
+        ("str", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'),
+        # without quotes, null
+        ("str", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'),
+    ],
+)
+def test_json_quoted_values_with_schema(col_type, json_str):
+    experimental_df = cudf.read_json(
+        StringIO(json_str),
+        engine="cudf_experimental",
+        orient="records",
+        dtype={"k": col_type},
+    )
+    cudf_df = cudf.read_json(
+        StringIO(json_str.replace(",", "\n")[1:-1]),
+        engine="cudf",
+        orient="records",
+        lines=True,
+        dtype={"k": col_type},
+    )
+    assert_eq(cudf_df, experimental_df)
+
+
 @pytest.mark.parametrize(
     "keep_quotes,result",
     [

From 1317be3b5515ca6f1dd4226ed1d97c6912a609ee Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 17:21:51 +0530
Subject: [PATCH 18/39] remove debug prints

---
 cpp/tests/io/json_test.cpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index c7cd8bef9e3..396072e8b8a 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1529,14 +1529,9 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
 
     EXPECT_EQ(result.tbl->num_columns(), 1);
     EXPECT_EQ(result.tbl->num_rows(), 16);
-
-    EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id());
-
     EXPECT_EQ(result.metadata.schema_info[0].name, "0");
 
-    std::cout << "Column: " << col_type << std::endl;
-    cudf::test::print(result.tbl->get_column(0));
-
+    EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id());
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]);
   }
 }

From ce863b389fbbc5ea99e4522fcae02aad9ba9266c Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 17:24:20 +0530
Subject: [PATCH 19/39] replace @param[in] with @param

---
 cpp/src/io/utilities/parsing_utils.cuh | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 414375147f2..405f425716d 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -469,7 +469,7 @@ cudf::size_type count_all_from_set(host_span<char const> data,
 /**
  * @brief Checks whether the given character is a whitespace character.
  *
- * @param[in] ch The character to check
+ * @param ch The character to check
  *
  * @return True if the input is whitespace, False otherwise
  */
@@ -487,9 +487,9 @@ __inline__ __device__ It skip_character(It const& it, char ch)
 /**
  * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters.
  *
- * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end pointer to the first character after the parsing range
- * @param[in] quotechar The character used to denote quotes; '\0' if none
+ * @param begin Pointer to the first character in the parsing range
+ * @param end pointer to the first character after the parsing range
+ * @param quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range
  */
@@ -510,9 +510,9 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quote
 /**
  * @brief Adjusts the range to ignore starting/trailing quotation characters.
  *
- * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end pointer to the first character after the parsing range
- * @param[in] quotechar The character used to denote quotes; '\0' if none
+ * @param begin Pointer to the first character in the parsing range
+ * @param end pointer to the first character after the parsing range
+ * @param quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range
  */

From d9fc33b16390399cc2678401da68cf7a52c34c10 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 17:31:59 +0530
Subject: [PATCH 20/39] nan fix moved to PR #12022

---
 python/cudf/cudf/tests/test_csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 580b5a2b2df..0895da19c8b 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -796,7 +796,7 @@ def test_csv_reader_bools_NA():
     expected = pd.DataFrame(
         {
             "text": ["true", "false", "foo", "bar", "qux"],
-            "int": [1.0, 0.0, 1.0, 0.0, np.nan],
+            "int": [1, 0, 1, 0, 0],
         }
     )
     # breaking behaviour is np.nan for qux

From 2ec4bd88b6e79c7f2fd9b720580c5a5d26c2c7a6 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 17:35:29 +0530
Subject: [PATCH 21/39] null for parsing error in bool - update pytest

---
 python/cudf/cudf/tests/test_csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 0895da19c8b..580b5a2b2df 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -796,7 +796,7 @@ def test_csv_reader_bools_NA():
     expected = pd.DataFrame(
         {
             "text": ["true", "false", "foo", "bar", "qux"],
-            "int": [1, 0, 1, 0, 0],
+            "int": [1.0, 0.0, 1.0, 0.0, np.nan],
         }
     )
     # breaking behaviour is np.nan for qux

From ff77d5f7d6bd677ca85006cd59899a96af3e93fb Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 18:03:27 +0530
Subject: [PATCH 22/39] doc update

---
 cpp/src/io/utilities/parsing_utils.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 405f425716d..0077d3f1fa4 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -488,7 +488,7 @@ __inline__ __device__ It skip_character(It const& it, char ch)
  * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters.
  *
  * @param begin Pointer to the first character in the parsing range
- * @param end pointer to the first character after the parsing range
+ * @param end Pointer to the first character after the parsing range
  * @param quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range
@@ -511,7 +511,7 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quote
  * @brief Adjusts the range to ignore starting/trailing quotation characters.
  *
  * @param begin Pointer to the first character in the parsing range
- * @param end pointer to the first character after the parsing range
+ * @param end Pointer to the first character after the parsing range
  * @param quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range

From ba36b86e0e5c7232e70fd90f49a5289004eef26c Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 18:03:42 +0530
Subject: [PATCH 23/39] skip writing NAN for floating type for nulls

---
 cpp/src/io/utilities/parsing_utils.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 0077d3f1fa4..f003724c77b 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -629,7 +629,7 @@ struct ConvertFunctor {
       if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
       return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
-    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
+    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
     return value.has_value() and !std::isnan(*value);
   }

From 3e7148237c76643fa13295875e919c208b8472fe Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 19:11:19 +0530
Subject: [PATCH 24/39] fix dereferencing {begin - 1} in json reader

---
 cpp/src/io/json/json_gpu.cu            | 28 ++++++++++++++++++--------
 cpp/src/io/utilities/parsing_utils.cuh | 22 ++++++++++++++++++++
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index e4fb7e2edf5..ca2737a5930 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -156,6 +156,7 @@ struct field_descriptor {
   cudf::size_type column;
   char const* value_begin;
   char const* value_end;
+  bool is_quoted;
 };
 
 /**
@@ -178,7 +179,10 @@ __device__ field_descriptor next_field_descriptor(const char* begin,
   auto const desc_pre_trim =
     col_map.capacity() == 0
       // No key - column and begin are trivial
-      ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)}
+      ? field_descriptor{field_idx,
+                         begin,
+                         cudf::io::gpu::seek_field_end(begin, end, opts, true),
+                         false}
       : [&]() {
           auto const key_range = get_next_key(begin, end, opts.quotechar);
           auto const key_hash  = cudf::detail::MurmurHash3_32<cudf::string_view>{}(
@@ -189,14 +193,23 @@ __device__ field_descriptor next_field_descriptor(const char* begin,
 
           // Skip the colon between the key and the value
           auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1;
-          return field_descriptor{
-            column, value_begin, cudf::io::gpu::seek_field_end(value_begin, end, opts, true)};
+          return field_descriptor{column,
+                                  value_begin,
+                                  cudf::io::gpu::seek_field_end(value_begin, end, opts, true),
+                                  false};
         }();
 
   // Modify start & end to ignore whitespace and quotechars
   auto const trimmed_value_range =
-    trim_whitespaces_quotes(desc_pre_trim.value_begin, desc_pre_trim.value_end, opts.quotechar);
-  return {desc_pre_trim.column, trimmed_value_range.first, trimmed_value_range.second};
+    trim_whitespaces(desc_pre_trim.value_begin, desc_pre_trim.value_end);
+  bool const is_quoted =
+    thrust::distance(trimmed_value_range.first, trimmed_value_range.second) >= 2 and
+    *trimmed_value_range.first == opts.quotechar and
+    *thrust::prev(trimmed_value_range.second) == opts.quotechar;
+  return {desc_pre_trim.column,
+          trimmed_value_range.first + is_quoted,
+          trimmed_value_range.second - is_quoted,
+          is_quoted};
 }
 
 /**
@@ -255,8 +268,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts,
     auto const desc =
       next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map);
     auto const value_len = static_cast<size_t>(std::max(desc.value_end - desc.value_begin, 0L));
-    const bool is_quoted =
-      *(desc.value_begin - 1) == opts.quotechar && *desc.value_end == opts.quotechar;
+    auto const is_quoted = static_cast<std::ptrdiff_t>(desc.is_quoted);
 
     current = desc.value_end + 1;
 
@@ -348,7 +360,7 @@ __global__ void detect_data_types_kernel(
       atomicAdd(&column_infos[desc.column].null_count, -1);
     }
     // Don't need counts to detect strings, any field in quotes is deduced to be a string
-    if (*(desc.value_begin - 1) == opts.quotechar && *desc.value_end == opts.quotechar) {
+    if (desc.is_quoted) {
       atomicAdd(&column_infos[desc.column].string_count, 1);
       continue;
     }
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 354abfa3985..772e706fc80 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -523,6 +523,28 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quote
   return {skip_character(trim_begin, quotechar), skip_character(trim_end, quotechar).base()};
 }
 
+/**
+ * @brief Adjusts the range to ignore starting/trailing whitespace characters.
+ *
+ * @param[in] begin Pointer to the first character in the parsing range
+ * @param[in] end pointer to the first character after the parsing range
+ *
+ * @return Trimmed range
+ */
+__inline__ __device__ std::pair<char const*, char const*> trim_whitespaces(char const* begin,
+                                                                           char const* end)
+{
+  auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); };
+
+  auto const trim_begin = thrust::find_if(thrust::seq, begin, end, not_whitespace);
+  auto const trim_end   = thrust::find_if(thrust::seq,
+                                        thrust::make_reverse_iterator(end),
+                                        thrust::make_reverse_iterator(trim_begin),
+                                        not_whitespace);
+
+  return {trim_begin, trim_end.base()};
+}
+
 /**
  * @brief Adjusts the range to ignore starting/trailing quotation characters.
  *

From 9b64cbd954af7480de79fbb39a1a5110629e3adb Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 19:14:19 +0530
Subject: [PATCH 25/39] doc update

---
 cpp/src/io/utilities/parsing_utils.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 772e706fc80..2da7a28e96c 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -504,7 +504,7 @@ __inline__ __device__ It skip_character(It const& it, char ch)
  * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters.
  *
  * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end pointer to the first character after the parsing range
+ * @param[in] end Pointer to the first character after the parsing range
  * @param[in] quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range
@@ -527,7 +527,7 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quote
  * @brief Adjusts the range to ignore starting/trailing whitespace characters.
  *
  * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end pointer to the first character after the parsing range
+ * @param[in] end Pointer to the first character after the parsing range
  *
  * @return Trimmed range
  */
@@ -549,7 +549,7 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces(char
  * @brief Adjusts the range to ignore starting/trailing quotation characters.
  *
  * @param begin Pointer to the first character in the parsing range
- * @param end pointer to the first character after the parsing range
+ * @param end Pointer to the first character after the parsing range
  * @param quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range

From 90cdf3aa5838a1cce07ffc3093847762aaa35400 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 28 Oct 2022 23:20:54 +0530
Subject: [PATCH 26/39] ignore data for nulls in
 CsvReaderTest.InvalidFloatingPoint test

---
 cpp/tests/io/csv_test.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index f532836ef95..7f975741ae8 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1188,8 +1188,7 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint)
 
   const auto col_data = cudf::test::to_host<float>(view.column(0));
   // col_data.first contains the column data
-  for (const auto& elem : col_data.first)
-    ASSERT_TRUE(std::isnan(elem));
+  // ignore all data because it is all nulls.
   // col_data.second contains the bitmasks
   ASSERT_EQ(0u, col_data.second[0]);
 }

From 6710b4701c098fcdc4de3de06be152380582a713 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 29 Oct 2022 01:06:49 +0530
Subject: [PATCH 27/39] add pandas test for custom bool literals

---
 python/cudf/cudf/tests/test_csv.py | 37 ++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 0895da19c8b..e805f4a6867 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -766,6 +766,43 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses):
     assert_eq(df_out, out)
 
 
+def test_csv_reader_bools_custom():
+    names = ["text", "int"]
+    dtypes = ["str", "int"]
+    trues = ["foo"]
+    falses = ["bar"]
+    lines = [
+        ",".join(names),
+        "true,true",
+        "false,false",
+        "foo,foo",
+        "bar,bar",
+        "0,0",
+    ]
+    buffer = "\n".join(lines)
+
+    df = read_csv(
+        StringIO(buffer),
+        names=names,
+        dtype=dtypes,
+        skiprows=1,
+        true_values=trues,
+        false_values=falses,
+    )
+    assert len(df.columns) == 2
+    assert df["text"].dtype == np.dtype("object")
+    assert df["int"].dtype == np.dtype("int64")
+    expected = pd.read_csv(
+        StringIO(buffer),
+        names=names,
+        dtype=dtypes,
+        skiprows=1,
+        true_values=trues,
+        false_values=falses,
+    )
+    assert_eq(df, expected)
+
+
 def test_csv_reader_bools_NA():
     names = ["text", "int"]
     dtypes = ["str", "int"]

From 99eedc059a38503da43a8434e16eb169830e992d Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Mon, 31 Oct 2022 12:57:02 +0530
Subject: [PATCH 28/39] Update python/cudf/cudf/tests/test_csv.py

Co-authored-by: GALI PREM SAGAR <sagarprem75@gmail.com>
---
 python/cudf/cudf/tests/test_csv.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index e805f4a6867..8a0d15cbfe2 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -789,9 +789,6 @@ def test_csv_reader_bools_custom():
         true_values=trues,
         false_values=falses,
     )
-    assert len(df.columns) == 2
-    assert df["text"].dtype == np.dtype("object")
-    assert df["int"].dtype == np.dtype("int64")
     expected = pd.read_csv(
         StringIO(buffer),
         names=names,
@@ -800,7 +797,7 @@ def test_csv_reader_bools_custom():
         true_values=trues,
         false_values=falses,
     )
-    assert_eq(df, expected)
+    assert_eq(df, expected, check_dtype=True)
 
 
 def test_csv_reader_bools_NA():

From 0415252a9ace6aaef406893eee12a88e5146ca41 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 31 Oct 2022 13:16:52 +0530
Subject: [PATCH 29/39] static_cast<std::ptrdiff_t> for is_quoted in pointer
 arithmetic

---
 cpp/src/io/json/json_gpu.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index ca2737a5930..d2e69d086e6 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -207,8 +207,8 @@ __device__ field_descriptor next_field_descriptor(const char* begin,
     *trimmed_value_range.first == opts.quotechar and
     *thrust::prev(trimmed_value_range.second) == opts.quotechar;
   return {desc_pre_trim.column,
-          trimmed_value_range.first + is_quoted,
-          trimmed_value_range.second - is_quoted,
+          trimmed_value_range.first + static_cast<std::ptrdiff_t>(is_quoted),
+          trimmed_value_range.second - static_cast<std::ptrdiff_t>(is_quoted),
           is_quoted};
 }
 

From da4b16f6db4abf5742fe60c3111a89793f6504fa Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 31 Oct 2022 14:08:58 +0530
Subject: [PATCH 30/39] skip nan check for null elements in test

---
 cpp/tests/io/json_test.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 396072e8b8a..999f56b9808 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -914,8 +914,7 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint)
 
   const auto col_data = cudf::test::to_host<float>(result.tbl->view().column(0));
   // col_data.first contains the column data
-  for (const auto& elem : col_data.first)
-    ASSERT_TRUE(std::isnan(elem));
+  // ignore all data because it is all nulls.
   // col_data.second contains the bitmasks
   ASSERT_EQ(0u, col_data.second[0]);
 }

From 53e0f2b6f285e7f123ee936a30d4cb01f2238c1a Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Tue, 1 Nov 2022 15:17:10 +0530
Subject: [PATCH 31/39] Apply suggestions from code review

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 cpp/src/io/utilities/parsing_utils.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 2da7a28e96c..02575767e52 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -550,13 +550,13 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces(char
  *
  * @param begin Pointer to the first character in the parsing range
  * @param end Pointer to the first character after the parsing range
- * @param quotechar The character used to denote quotes; '\0' if none
+ * @param quotechar The character used to denote quotes. Provide '\0' if no quotes should be trimmed.
  *
  * @return Trimmed range
  */
 __inline__ __device__ std::pair<char const*, char const*> trim_quotes(char const* begin,
                                                                       char const* end,
-                                                                      char quotechar = '\0')
+                                                                      char quotechar)
 {
   if ((thrust::distance(begin, end) >= 2 && *begin == quotechar &&
        *thrust::prev(end) == quotechar)) {

From dd6a0ca9ec9d3eb9c06ce6d2573c73168f51c34d Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Tue, 1 Nov 2022 15:17:27 +0530
Subject: [PATCH 32/39] Update cpp/src/io/utilities/parsing_utils.cuh

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 cpp/src/io/utilities/parsing_utils.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 02575767e52..19a520e2f6f 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -526,8 +526,8 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quote
 /**
  * @brief Adjusts the range to ignore starting/trailing whitespace characters.
  *
- * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end Pointer to the first character after the parsing range
+ * @param begin Pointer to the first character in the parsing range
+ * @param end Pointer to the first character after the parsing range
  *
  * @return Trimmed range
  */

From cc2a348038a227a6aca028cde9e92658103c8121 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 2 Nov 2022 13:18:36 +0530
Subject: [PATCH 33/39] review comments, style fix

---
 cpp/src/io/json/json_gpu.cu            | 1 -
 cpp/src/io/utilities/parsing_utils.cuh | 9 +++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index d2e69d086e6..8b6c0f9d528 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -274,7 +274,6 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts,
 
     using string_index_pair = thrust::pair<const char*, size_type>;
 
-    // Empty fields are not legal values
     if (!serialized_trie_contains(opts.trie_na,
                                   {desc.value_begin - is_quoted, value_len + is_quoted * 2})) {
       // Type dispatcher does not handle strings
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 19a520e2f6f..cbd417c2b5b 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -503,9 +503,9 @@ __inline__ __device__ It skip_character(It const& it, char ch)
 /**
  * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters.
  *
- * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end Pointer to the first character after the parsing range
- * @param[in] quotechar The character used to denote quotes; '\0' if none
+ * @param begin Pointer to the first character in the parsing range
+ * @param end Pointer to the first character after the parsing range
+ * @param quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range
  */
@@ -550,7 +550,8 @@ __inline__ __device__ std::pair<char const*, char const*> trim_whitespaces(char
  *
  * @param begin Pointer to the first character in the parsing range
  * @param end Pointer to the first character after the parsing range
- * @param quotechar The character used to denote quotes. Provide '\0' if no quotes should be trimmed.
+ * @param quotechar The character used to denote quotes. Provide '\0' if no quotes should be
+ * trimmed.
  *
  * @return Trimmed range
  */

From dc228e5078b9b53837089936a590ffebd42bf774 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 2 Nov 2022 13:35:08 +0530
Subject: [PATCH 34/39] pandas vs cudf csv parser differences

bool literals give parsing errors as int
"0" and "1" give parsing errors as bool in pandas
---
 python/cudf/cudf/tests/test_csv.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 8a0d15cbfe2..e85d404d2c4 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -767,10 +767,10 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses):
 
 
 def test_csv_reader_bools_custom():
-    names = ["text", "int"]
-    dtypes = ["str", "int"]
-    trues = ["foo"]
-    falses = ["bar"]
+    names = ["text", "bool"]
+    dtypes = {"text": "str", "bool": "bool"}
+    trues = ["foo", "1"]
+    falses = ["bar", "0"]
     lines = [
         ",".join(names),
         "true,true",
@@ -778,6 +778,7 @@ def test_csv_reader_bools_custom():
         "foo,foo",
         "bar,bar",
         "0,0",
+        "1,1",
     ]
     buffer = "\n".join(lines)
 
@@ -789,6 +790,9 @@ def test_csv_reader_bools_custom():
         true_values=trues,
         false_values=falses,
     )
+
+    # Note: bool literals give parsing errors as int
+    # "0" and "1" give parsing errors as bool in pandas
     expected = pd.read_csv(
         StringIO(buffer),
         names=names,

From 7e231ac6aa34eaf92d88887c0da0b077120fe8ba Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Fri, 4 Nov 2022 03:20:39 +0530
Subject: [PATCH 35/39] Apply suggestions from code review

---
 cpp/src/io/utilities/parsing_utils.cuh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 445ec871dbc..0015777279b 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -598,14 +598,13 @@ struct ConvertFunctor {
                                                       bool as_hex)
   {
     // TODO what's invalid input
-    bool all_digits_valid = true;
     static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
-      [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t<T> {
+      [&opts, output_type, begin, end]() -> device_storage_type_t<T> {
       return strings::detail::parse_decimal<device_storage_type_t<T>>(
         begin, end, output_type.scale());
     }();
 
-    return all_digits_valid;
+    return true;
   }
 
   /**

From d7fb88ca6157dc7059f513e46c0103139ad8f287 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 15 Nov 2022 02:58:08 +0530
Subject: [PATCH 36/39] address review comments

---
 cpp/src/io/utilities/parsing_utils.cuh | 18 +++++---
 cpp/tests/io/json_test.cpp             | 59 +++++++++++++++++++-------
 2 files changed, 56 insertions(+), 21 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 0015777279b..6c5c817eb02 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -237,7 +237,7 @@ __host__ __device__ std::optional<T> parse_numeric(char const* begin,
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return {}; }
+  if (!all_digits_valid) { return std::nullopt; }
 
   return value * sign;
 }
@@ -622,8 +622,12 @@ struct ConvertFunctor {
     auto const value = [&opts, begin, end]() -> std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
-      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
-      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
+        return static_cast<T>(true);
+      }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) {
+        return static_cast<T>(false);
+      }
       return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
     if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
@@ -647,8 +651,12 @@ struct ConvertFunctor {
     auto const value = [&opts, begin, end]() -> std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
-      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
-      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
+        return static_cast<T>(true);
+      }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) {
+        return static_cast<T>(false);
+      }
       return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
     if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 999f56b9808..5da4d8969d6 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1487,27 +1487,54 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
       0, [=](auto i) -> bool { return static_cast<bool>(validity[i]); });
   };
 
-  constexpr int iNA    = -1;
-  constexpr double dNA = std::numeric_limits<double>::quiet_NaN();
-  constexpr bool bNA   = false;
-
-  std::vector<int> const validity_int   = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
-  std::vector<int> const validity_float = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
-  std::vector<int> const validity_bool  = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
-
-  auto int_col = int_wrapper{{0, 0, iNA, 1, 1, iNA, iNA, iNA, iNA, 1, 0, iNA, 1, 0, iNA, iNA},
-                             make_validity(validity_int)};
-  auto float_col =
-    float_wrapper{{0.0, 0.0, dNA, 1.0, 1.0, dNA, dNA, dNA, dNA, 1.0, 0.0, dNA, 1.0, 0.0, dNA, dNA},
-                  make_validity(validity_float)};
+  constexpr int int_NA       = 0;
+  constexpr double double_NA = std::numeric_limits<double>::quiet_NaN();
+  constexpr bool bool_NA     = false;
+
+  std::vector<int> const validity = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
+
+  auto int_col = int_wrapper{
+    {0, 0, int_NA, 1, 1, int_NA, int_NA, int_NA, int_NA, 1, 0, int_NA, 1, 0, int_NA, int_NA},
+    make_validity(validity)};
+  auto float_col = float_wrapper{{0.0,
+                                  0.0,
+                                  double_NA,
+                                  1.0,
+                                  1.0,
+                                  double_NA,
+                                  double_NA,
+                                  double_NA,
+                                  double_NA,
+                                  1.0,
+                                  0.0,
+                                  double_NA,
+                                  1.0,
+                                  0.0,
+                                  double_NA,
+                                  double_NA},
+                                 make_validity(validity)};
   auto str_col =
     cudf::test::strings_column_wrapper{// clang-format off
     {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"},
      cudf::test::iterators::nulls_at(std::vector<int>{8})};
   // clang-format on
-  auto bool_col = bool_wrapper{
-    {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA},
-    make_validity(validity_bool)};
+  auto bool_col = bool_wrapper{{false,
+                                false,
+                                bool_NA,
+                                true,
+                                true,
+                                bool_NA,
+                                bool_NA,
+                                bool_NA,
+                                bool_NA,
+                                true,
+                                false,
+                                bool_NA,
+                                true,
+                                false,
+                                bool_NA,
+                                bool_NA},
+                               make_validity(validity)};
 
   // Types to test
   const std::vector<data_type> dtypes = {

From bea7fc3aca340fd1e55c1de5ffaa8571261fa933 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 15 Nov 2022 04:57:54 +0530
Subject: [PATCH 37/39] replace std::nullopt

---
 cpp/src/io/utilities/parsing_utils.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 6c5c817eb02..a78700db484 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -237,7 +237,7 @@ __host__ __device__ std::optional<T> parse_numeric(char const* begin,
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return std::nullopt; }
+  if (!all_digits_valid) { return std::optional<T>{}; }
 
   return value * sign;
 }

From 0e45d133ce3631ac8b74d8188dc1c9d3361ee717 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 15 Nov 2022 05:28:27 +0530
Subject: [PATCH 38/39] remove breaking behavior on nulls

---
 cpp/src/io/utilities/parsing_utils.cuh | 10 +++++-----
 cpp/tests/io/csv_test.cpp              |  3 ++-
 cpp/tests/io/json_test.cpp             |  7 ++++---
 python/cudf/cudf/tests/test_csv.py     |  2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index a78700db484..71c86553c4d 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -578,9 +578,9 @@ struct ConvertFunctor {
       return as_hex ? cudf::io::parse_numeric<T, 16>(begin, end, opts)
                     : cudf::io::parse_numeric<T>(begin, end, opts);
     }();
-    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
+    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
 
-    return value.has_value();
+    return true;
   }
 
   /**
@@ -630,9 +630,9 @@ struct ConvertFunctor {
       }
       return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
-    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
+    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
 
-    return value.has_value();
+    return true;
   }
 
   /**
@@ -659,7 +659,7 @@ struct ConvertFunctor {
       }
       return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
-    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
+    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
 
     return value.has_value() and !std::isnan(*value);
   }
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index af2e0f5abb9..8acc6f8f6ee 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1170,7 +1170,8 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint)
 
   const auto col_data = cudf::test::to_host<float>(view.column(0));
   // col_data.first contains the column data
-  // ignore all data because it is all nulls.
+  for (const auto& elem : col_data.first)
+    ASSERT_TRUE(std::isnan(elem));
   // col_data.second contains the bitmasks
   ASSERT_EQ(0u, col_data.second[0]);
 }
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 5da4d8969d6..f7b21008f70 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -914,7 +914,8 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint)
 
   const auto col_data = cudf::test::to_host<float>(result.tbl->view().column(0));
   // col_data.first contains the column data
-  // ignore all data because it is all nulls.
+  for (const auto& elem : col_data.first)
+    ASSERT_TRUE(std::isnan(elem));
   // col_data.second contains the bitmasks
   ASSERT_EQ(0u, col_data.second[0]);
 }
@@ -1495,7 +1496,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
 
   auto int_col = int_wrapper{
     {0, 0, int_NA, 1, 1, int_NA, int_NA, int_NA, int_NA, 1, 0, int_NA, 1, 0, int_NA, int_NA},
-    make_validity(validity)};
+    cudf::test::iterators::nulls_at(std::vector<int>{8})};
   auto float_col = float_wrapper{{0.0,
                                   0.0,
                                   double_NA,
@@ -1534,7 +1535,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
                                 false,
                                 bool_NA,
                                 bool_NA},
-                               make_validity(validity)};
+                               cudf::test::iterators::nulls_at(std::vector<int>{8})};
 
   // Types to test
   const std::vector<data_type> dtypes = {
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 02569285efe..e85d404d2c4 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -834,7 +834,7 @@ def test_csv_reader_bools_NA():
     expected = pd.DataFrame(
         {
             "text": ["true", "false", "foo", "bar", "qux"],
-            "int": [1.0, 0.0, 1.0, 0.0, np.nan],
+            "int": [1, 0, 1, 0, 0],
         }
     )
     # breaking behaviour is np.nan for qux

From df6844c6b3263ffc9f1b7045970ffb8d80855728 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 15 Nov 2022 15:29:43 +0530
Subject: [PATCH 39/39] TODO clarify

---
 cpp/src/io/utilities/parsing_utils.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 71c86553c4d..89806956ae5 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -597,7 +597,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    // TODO what's invalid input
+    // TODO decide what's invalid input and update parsing functions
     static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
       [&opts, output_type, begin, end]() -> device_storage_type_t<T> {
       return strings::detail::parse_decimal<device_storage_type_t<T>>(
@@ -678,7 +678,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    // TODO what's invalid input
+    // TODO decide what's invalid input and update parsing functions
     if constexpr (cudf::is_timestamp<T>()) {
       static_cast<T*>(out_buffer)[row] = to_timestamp<T>(begin, end, opts.dayfirst);
     } else if constexpr (cudf::is_duration<T>()) {