Fix CUDA kernel crash in GPU avro decoder.

rapidsai · Mar 16, 2023 · 4d2a317 · 4d2a317
1 parent 75dbbc9
commit 4d2a317
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 5 deletions.
diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu
@@ -96,7 +96,11 @@ avro_decode_row(schemadesc_s const* schema,
         --skip;
       }
       if (i >= schema_len || skip_after < 0) break;
-      kind = schema[i].kind;
+      kind         = schema[i].kind;
+      logical_kind = schema[i].logical_kind;
+      if (is_supported_logical_type(logical_kind)) {
+        kind = static_cast<type_kind_e>(logical_kind);
+      }
       skip = skip_after;
     }
 
@@ -110,13 +114,15 @@ avro_decode_row(schemadesc_s const* schema,
         break;
 
       case type_int: {
-        int64_t v                           = avro_decode_zigzag_varint(cur, end);
-        static_cast<int32_t*>(dataptr)[row] = static_cast<int32_t>(v);
+        int64_t v = avro_decode_zigzag_varint(cur, end);
+        if (dataptr != nullptr && row < max_rows) {
+          static_cast<int32_t*>(dataptr)[row] = static_cast<int32_t>(v);
+        }
       } break;
 
       case type_long: {
-        int64_t v                           = avro_decode_zigzag_varint(cur, end);
-        static_cast<int64_t*>(dataptr)[row] = v;
+        int64_t v = avro_decode_zigzag_varint(cur, end);
+        if (dataptr != nullptr && row < max_rows) { static_cast<int64_t*>(dataptr)[row] = v; }
       } break;
 
       case type_bytes: [[fallthrough]];

diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -14,6 +14,7 @@
 
 import datetime
 import io
+import pathlib
 from typing import Optional
 
 import fastavro
@@ -355,3 +356,91 @@ def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null):
     )
 
     assert_eq(expected, actual)
+
+
+def test_alltypes_plain_avro():
+    # During development of the logical type support, the Java avro tests were
+    # triggering CUDA kernel crashes (null pointer dereferences).  We were able
+    # to replicate the behavior in a C++ test case, and then subsequently came
+    # up with this Python unit test to also trigger the problematic code path.
+    #
+    # So, unlike the other tests, this test is inherently reactive in nature,
+    # added simply to verify we fixed the problematic code path that was
+    # causing CUDA kernel crashes.
+    #
+    # See https://github.com/rapidsai/cudf/pull/12788#issuecomment-1468822875
+    # for more information.
+    relpath = "../../../../java/src/test/resources/alltypes_plain.avro"
+    path = pathlib.Path(__file__).parent.joinpath(relpath).resolve()
+    assert path.is_file(), path
+    path = str(path)
+
+    with open(path, "rb") as f:
+        reader = fastavro.reader(f)
+        records = [record for record in reader]
+
+    # For reference:
+    #
+    # >>> from pprint import pprint
+    # >>> pprint(reader.writer_schema)
+    # {'fields': [{'name': 'id', 'type': ['int', 'null']},
+    #             {'name': 'bool_col', 'type': ['boolean', 'null']},
+    #             {'name': 'tinyint_col', 'type': ['int', 'null']},
+    #             {'name': 'smallint_col', 'type': ['int', 'null']},
+    #             {'name': 'int_col', 'type': ['int', 'null']},
+    #             {'name': 'bigint_col', 'type': ['long', 'null']},
+    #             {'name': 'float_col', 'type': ['float', 'null']},
+    #             {'name': 'double_col', 'type': ['double', 'null']},
+    #             {'name': 'date_string_col', 'type': ['bytes', 'null']},
+    #             {'name': 'string_col', 'type': ['bytes', 'null']},
+    #             {'name': 'timestamp_col',
+    #              'type': [{'logicalType': 'timestamp-micros',
+    #                        'type': 'long'},
+    #                       'null']}],
+    #  'name': 'topLevelRecord',
+    #  'type': 'record'}
+    #
+    # >>> pprint(records[0])
+    # {'bigint_col': 0,
+    #  'bool_col': True,
+    #  'date_string_col': b'03/01/09',
+    #  'double_col': 0.0,
+    #  'float_col': 0.0,
+    #  'id': 4,
+    #  'int_col': 0,
+    #  'smallint_col': 0,
+    #  'string_col': b'0',
+    #  'timestamp_col': datetime.datetime(2009, 3, 1, 0, 0,
+    #                                     tzinfo=datetime.timezone.utc),
+    #  'tinyint_col': 0}
+
+    # Nothing particularly special about these columns, other than them being
+    # the ones that @davidwendt used to coerce the crash.
+    columns = ["bool_col", "int_col", "timestamp_col"]
+
+    # This next line would trigger the fatal CUDA kernel crash.
+    actual = cudf.read_avro(path, columns=columns)
+
+    # If we get here, we haven't crashed, obviously.  Verify the returned data
+    # frame meets our expectations.  We need to fiddle with the dtypes of the
+    # expected data frame in order to correctly match the schema definition and
+    # our corresponding read_avro()-returned data frame.
+
+    data = [{column: row[column] for column in columns} for row in records]
+    expected = cudf.DataFrame(data)
+
+    # The fastavro.reader supports the `'logicalType': 'timestamp-micros'` used
+    # by the 'timestamp_col' column, which is converted into Python
+    # datetime.datetime() objects (see output of pprint(records[0]) above).
+    # As we don't support that logical type yet in cudf, we need to convert to
+    # int64, then divide by 1000 to convert from nanoseconds to microseconds.
+    timestamps = expected["timestamp_col"].astype("int64")
+    timestamps //= 1000
+    expected["timestamp_col"] = timestamps
+
+    # Furthermore, we need to force the 'int_col' into an int32, per the schema
+    # definition.  (It ends up as an int64 due to cudf.DataFrame() defaulting
+    # all Python int values to int64 sans a dtype= override.)
+    expected["int_col"] = expected["int_col"].astype("int32")
+
+    assert_eq(actual, expected)