Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

avro reader integration tests #7156

Merged
merged 7 commits into from
Feb 11, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 0 additions & 108 deletions python/cudf/cudf/tests/test_avro.py

This file was deleted.

185 changes: 185 additions & 0 deletions python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
cwharris marked this conversation as resolved.
Show resolved Hide resolved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

vuule marked this conversation as resolved.
Show resolved Hide resolved
import io

import fastavro
import pytest

import cudf
from cudf.tests.utils import assert_eq


def cudf_from_avro_util(schema, records):
    """Round-trip ``records`` through an in-memory Avro file into cudf.

    The schema is parsed with fastavro, the records are written to a
    ``BytesIO`` buffer with ``fastavro.writer``, and the buffer is then
    handed to ``cudf.read_avro``.

    Parameters
    ----------
    schema
        Avro schema accepted by ``fastavro.parse_schema``.
    records
        Iterable of records to serialize with ``fastavro.writer``.

    Returns
    -------
    The result of ``cudf.read_avro`` on the serialized data.
    """
    parsed_schema = fastavro.parse_schema(schema)

    stream = io.BytesIO()
    fastavro.writer(stream, parsed_schema, records)
    # Rewind so the reader starts from the beginning of the written data.
    stream.seek(0)

    return cudf.read_avro(stream)


# Pairs of (Avro primitive type name, dtype string expected from cudf),
# shared by the parametrized dtype-detection tests in this module.
avro_type_params = [
    ("boolean", "bool"),
    ("int", "int32"),
    ("long", "int64"),
    ("float", "float32"),
    ("double", "float64"),
    ("bytes", "str"),
    ("string", "str"),
]


@pytest.mark.parametrize("avro_type, expected_dtype", avro_type_params)
@pytest.mark.parametrize("namespace", [None, "root_ns"])
@pytest.mark.parametrize("nullable", [True, False])
def test_can_detect_dtype_from_avro_type(
    avro_type, expected_dtype, namespace, nullable
):
    """Check the column dtype produced for a single top-level Avro field.

    An Avro file with no records is written for a one-field record schema
    and read back; the resulting frame must equal an empty frame whose
    ``prop`` column has ``expected_dtype``. When ``nullable`` is true the
    field type is the union ``["null", avro_type]``.
    """
    field_type = ["null", avro_type] if nullable else avro_type

    # Pass the raw schema dict: cudf_from_avro_util already runs
    # fastavro.parse_schema, so parsing here as well would be redundant.
    schema = {
        "type": "record",
        "name": "test",
        "namespace": namespace,
        "fields": [{"name": "prop", "type": field_type}],
    }

    actual = cudf_from_avro_util(schema, [])

    expected = cudf.DataFrame(
        {"prop": cudf.Series(data=None, dtype=expected_dtype)}
    )

    assert_eq(expected, actual)


@pytest.mark.parametrize("avro_type, expected_dtype", avro_type_params)
@pytest.mark.parametrize("namespace", [None, "root_ns"])
@pytest.mark.parametrize("nullable", [True, False])
def test_can_detect_dtype_from_avro_type_nested(
    avro_type, expected_dtype, namespace, nullable
):
    """Check the column name and dtype for a field nested two records deep.

    The schema is root -> child -> leaf, with the primitive field ``prop3``
    on the leaf record. No records are written. The expected column name
    is ``child.leaf.prop3``, with the namespace prefixed to both ``child``
    and ``leaf`` when one is given.
    """
    leaf_field_type = ["null", avro_type] if nullable else avro_type

    # Build the three record levels inside-out as one nested literal.
    schema_root = {
        "name": "root",
        "type": "record",
        "namespace": namespace,
        "fields": [
            {
                "name": "prop1",
                "type": {
                    "name": "child",
                    "type": "record",
                    "fields": [
                        {
                            "name": "prop2",
                            "type": {
                                "name": "leaf",
                                "type": "record",
                                "fields": [
                                    {"name": "prop3", "type": leaf_field_type}
                                ],
                            },
                        }
                    ],
                },
            }
        ],
    }

    actual = cudf_from_avro_util(schema_root, [])

    ns_prefix = "" if namespace is None else namespace + "."
    col_name = "{ns}child.{ns}leaf.prop3".format(ns=ns_prefix)

    expected = cudf.DataFrame(
        {col_name: cudf.Series(None, None, expected_dtype)}
    )

    assert_eq(expected, actual)


@pytest.mark.parametrize(
"avro_type, cudf_type, avro_val, cudf_val",
[
("boolean", "bool", True, True),
("boolean", "bool", False, False),
("int", "int32", 1234, 1234),
("long", "int64", 1234, 1234),
("float", "float32", 12.34, 12.34),
("double", "float64", 12.34, 12.34),
("string", "str", "hey", "hey"),
cwharris marked this conversation as resolved.
Show resolved Hide resolved
# ('bytes', 'str', 'hey', 'hey'),
],
)
def test_can_parse_values(avro_type, cudf_type, avro_val, cudf_val):

schema_root = {
"name": "root",
"type": "record",
"fields": [{"name": "prop", "type": ["null", avro_type]}],
}

records = [
{"prop": avro_val},
{"prop": None},
]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the dataframe shape (1,2)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

expected and actual are the same shape. I don't know what shape that should be.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also have some tests with a large number of rows?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can test a large number of values. It would be nice to have a test data generator. I see we're generating random values for fuzz testing. Are we able to do that in a deterministic manner so it can also be used for unit tests?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC the data generator optionally takes a seed value, so that the output is deterministic for each seed. CC @galipremsagar for a pointer to the generator + sample use.

Copy link
Contributor

@galipremsagar galipremsagar Feb 4, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are discussing having a large number of rows, I'd recommend staying in the <30 rows range so as not to slow down the pytests by a lot, as that would slow down GPU CI too. If there is a bug that only reproduces in a large-column scenario then we can widen the test coverage for large columns; otherwise I think the fuzz tests should take care of testing large row counts. For using the dataset generator, here is how we can use it:

>>> import cudf
>>> from cudf.tests.dataset_generator import rand_dataframe
>>> rand_dataframe(dtypes_meta=[{"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}], 100, seed=2)
  File "<stdin>", line 1
SyntaxError: positional argument follows keyword argument
>>> rand_dataframe(dtypes_meta=[{"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}], rows=100, seed=2)
pyarrow.Table
0: int64
>>> cudf.DataFrame.from_arrow(rand_dataframe(dtypes_meta=[{"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}], rows=100, seed=2))
                       0
0   -1468954783236838137
1                   <NA>
2    2200161065918338095
3   -1193091257902529461
4   -5448271019629827509
..                   ...
95                  <NA>
96   2200161065918338095
97  -8745117541724490168
98                  <NA>
99  -4301277553722975852

[100 rows x 1 columns]

Alternatively, there is an existing API, widely used across our pytests, that also returns deterministic data for the same seed value:
https://github.com/rapidsai/cudf/blob/branch-0.18/python/cudf/cudf/datasets.py#L60
This is much simpler to use and fits the use-case here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we rather just change this test to use a list of values (with cudf_val of length 5 or 10) instead of a single value?


actual = cudf_from_avro_util(schema_root, records)

expected = cudf.DataFrame(
{"prop": cudf.Series(data=[cudf_val, None], dtype=cudf_type)}
)

assert_eq(expected, actual)


# @pytest.mark.parametrize("avro_type, cudf_type", avro_type_params)
# def test_can_parse_single_null(avro_type, cudf_type):
vuule marked this conversation as resolved.
Show resolved Hide resolved

# schema_root = {
# 'name': 'root',
# 'type': 'record',
# 'fields': [ { 'name': 'prop', 'type': ['null', avro_type] } ],
# }

# records = [
# {u'prop': None}
# ]

# actual = cudf_from_avro_util(schema_root, records)

# expected = cudf.DataFrame({
# 'prop': cudf.Series(data=[None], dtype=cudf_type)
# })

# assert_eq(expected, actual)

# @pytest.mark.parametrize("avro_type, cudf_type", avro_type_params)
# def test_can_parse_multiple_values(avro_type, cudf_type):

# schema_root = {
# 'name': 'root',
# 'type': 'record',
# 'fields': [ { 'name': 'prop', 'type': ['null', avro_type] } ],
# }

# records = [
# {u'prop': None}
# {u'prop': None}
# ]

# actual = cudf_from_avro_util(schema_root, records)

# expected = cudf.DataFrame({
# 'prop': cudf.Series(data=[None], dtype=cudf_type)
# })

# assert_eq(expected, actual)