From fd1e7c60bf6893b6fe9f88c6c81dedbc17f2b76c Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 11 Jun 2021 08:15:26 -0700 Subject: [PATCH 1/8] fixed OOB access for small files in unzip --- cpp/src/io/comp/uncomp.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index d5166b76892..f96aad33ead 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -210,6 +210,9 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len) // Start of central directory if (cdfh->sig == 0x02014b50) { dst->cdfh = cdfh; } } + // ensure we don't read further than the beginning of the file and we don't underflow + if(i == 0) + break; } } return (dst->eocd && dst->cdfh); From ca1c679ac7caf3ba5034912160fd7f42c2437944 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 11 Jun 2021 10:41:24 -0700 Subject: [PATCH 2/8] fixing style --- cpp/src/io/comp/uncomp.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index f96aad33ead..039665b34f7 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -211,8 +211,7 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len) if (cdfh->sig == 0x02014b50) { dst->cdfh = cdfh; } } // ensure we don't read further than the beginning of the file and we don't underflow - if(i == 0) - break; + if (i == 0) break; } } return (dst->eocd && dst->cdfh); From 36e1c89cc05ff7b4acd471df023cf0c5fca653cc Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 15 Jun 2021 02:36:37 -0700 Subject: [PATCH 3/8] added unit test for reading zip-compressed CSV --- cpp/tests/io/csv_test.cpp | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index e45b67505ba..c61aebf1e86 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1081,6 +1081,48 @@ TEST_F(CsvReaderTest, ArrowFileSource) expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); } +TEST_F(CsvReaderTest, ZippedCSV) +{ + // input is the following CSV zip'ed: + // "year,name,description\n1997,Ford,"Super, luxurious truck"\n1997,Ford,"Super, luxurious truck" + constexpr uint8_t compressed[] = { + 0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f, + 0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1c, 0x00, 0x7a, 0x69, + 0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76, 0x55, 0x54, 0x09, 0x00, + 0x03, 0x44, 0x72, 0xc8, 0x60, 0x45, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7, + 0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0xab, 0x4c, 0x4d, 0x2c, 0xd2, 0xc9, 0x4b, 0xcc, + 0x4d, 0xd5, 0x49, 0x49, 0x2d, 0x4e, 0x2e, 0xca, 0x2c, 0x28, 0xc9, 0xcc, 0xcf, 0xe3, 0x32, 0xb4, + 0xb4, 0x34, 0xd7, 0x71, 0xcb, 0x2f, 0x4a, 0xd1, 0x51, 0x0a, 0x2e, 0x2d, 0x48, 0x2d, 0xd2, 0x51, + 0xc8, 0x29, 0xad, 0x28, 0x2d, 0xca, 0xcc, 0x2f, 0x2d, 0x56, 0x28, 0x29, 0x2a, 0x4d, 0xce, 0x56, + 0x22, 0x46, 0x09, 0x00, 0x50, 0x4b, 0x01, 0x02, 0x1e, 0x03, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f, 0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa4, 0x81, 0x00, 0x00, + 0x00, 0x00, 0x7a, 0x69, 0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76, + 0x55, 0x54, 0x05, 0x00, 0x03, 0x44, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7, + 0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0x50, 0x4b, 0x05, 0x06, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x01, 0x00, 0x54, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + const auto filepath = temp_env->get_temp_dir() + "zipped_example.zip"; + { + std::ofstream outfile(filepath, std::ofstream::out | std::ofstream::binary); + outfile.write(reinterpret_cast(compressed), sizeof(compressed)); + } + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .compression(cudf::io::compression_type::ZIP); + const auto result = cudf_io::read_csv(in_opts); + const auto result_view = result.tbl->view(); + + auto year_col = std::vector{1997L, 1997L}; + auto name_col = std::vector{"Ford", "Ford"}; + auto descr_col = std::vector{"Super, luxurious truck", "Super, luxurious truck"}; + expect_column_data_equal(year_col, result_view.column(0)); + expect_column_data_equal(name_col, result_view.column(1)); + expect_column_data_equal(descr_col, result_view.column(2)); +} + TEST_F(CsvReaderTest, InvalidFloatingPoint) { const auto filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.csv"; From e4db732509cbeed2038f67277a6f7a9300a1c455 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 15 Jun 2021 02:48:39 -0700 Subject: [PATCH 4/8] replaced underflow handling --- cpp/src/io/comp/uncomp.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 039665b34f7..44581bbc184 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -193,16 +193,18 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len) memset(dst, 0, sizeof(zip_archive_s)); // Find the end of central directory if (len >= sizeof(zip_eocd_s) + 2) { - for (size_t i = len - sizeof(zip_eocd_s) - 2; i + sizeof(zip_eocd_s) + 2 + 0xffff >= len; i--) { + for (ptrdiff_t i = len - sizeof(zip_eocd_s) - 2; + i + sizeof(zip_eocd_s) + 2 + 0xffff >= len && i >= 0; + i--) { const zip_eocd_s *eocd = reinterpret_cast(raw + i); if (eocd->sig == 0x06054b50 && eocd->disk_id == eocd->start_disk // multi-file archives not supported && eocd->num_entries == eocd->total_entries && eocd->cdir_size >= sizeof(zip_cdfh_s) * eocd->num_entries && eocd->cdir_offset < len && - i + *reinterpret_cast(eocd + 1) <= len) { + i + *reinterpret_cast(eocd + 1) <= static_cast(len)) { const zip_cdfh_s *cdfh = reinterpret_cast(raw + eocd->cdir_offset); dst->eocd = eocd; - if (i >= sizeof(zip64_eocdl)) { + if (i >= static_cast(sizeof(zip64_eocdl))) { const zip64_eocdl *eocdl = reinterpret_cast(raw + i - sizeof(zip64_eocdl)); if (eocdl->sig == 0x07064b50) { dst->eocdl = eocdl; } @@ -210,8 +212,6 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len) // Start of central directory if (cdfh->sig == 0x02014b50) { dst->cdfh = cdfh; } } - // ensure we don't read further than the beginning of the file and we don't underflow - if (i == 0) break; } } return (dst->eocd && dst->cdfh); From d4a3a78b47af5791acb0e1ef706a2277e6e75de2 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 16 Jun 2021 00:18:31 -0700 Subject: [PATCH 5/8] add python test for small zip files --- python/cudf/cudf/tests/test_csv.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 0a31624ef7c..a6396756a63 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -979,6 +979,30 @@ def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src): assert_eq(expect, got) +def test_small_zip(tmpdir): + df = pd.DataFrame( + { + "a": [1997] * 2, + "b": ["Ford"] * 2, + "c": ["Super, luxurious truck"] * 2, + } + ) + + fname = tmpdir.join("small_zip_file.zip") + df.to_csv(fname, index=False) + + df = pd.DataFrame( + { + "a": [1997] * 2, + "b": ["Ford"] * 2, + "c": ["Superd, luxurious truck"] * 2, + } + ) + + got = cudf.read_csv(fname) + assert_eq(df, got) + + def test_csv_reader_carriage_return(tmpdir): rows = 1000 names = ["int_row", "int_double_row"] From 3c9e6df7449796ca8039a0c082787811c4512d10 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 16 Jun 2021 08:37:28 -0700 Subject: [PATCH 6/8] reverted c++ test in favor of python test --- cpp/tests/io/csv_test.cpp | 42 --------------------------------------- 1 file changed, 42 deletions(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index c2e707d9f06..f2278267f74 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1081,48 +1081,6 @@ TEST_F(CsvReaderTest, ArrowFileSource) expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); } -TEST_F(CsvReaderTest, ZippedCSV) -{ - // input is the following CSV zip'ed: - // "year,name,description\n1997,Ford,"Super, luxurious truck"\n1997,Ford,"Super, luxurious truck" - constexpr uint8_t compressed[] = { - 0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f, - 0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1c, 0x00, 0x7a, 0x69, - 0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76, 0x55, 0x54, 0x09, 0x00, - 0x03, 0x44, 0x72, 0xc8, 0x60, 0x45, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7, - 0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0xab, 0x4c, 0x4d, 0x2c, 0xd2, 0xc9, 0x4b, 0xcc, - 0x4d, 0xd5, 0x49, 0x49, 0x2d, 0x4e, 0x2e, 0xca, 0x2c, 0x28, 0xc9, 0xcc, 0xcf, 0xe3, 0x32, 0xb4, - 0xb4, 0x34, 0xd7, 0x71, 0xcb, 0x2f, 0x4a, 0xd1, 0x51, 0x0a, 0x2e, 0x2d, 0x48, 0x2d, 0xd2, 0x51, - 0xc8, 0x29, 0xad, 0x28, 0x2d, 0xca, 0xcc, 0x2f, 0x2d, 0x56, 0x28, 0x29, 0x2a, 0x4d, 0xce, 0x56, - 0x22, 0x46, 0x09, 0x00, 0x50, 0x4b, 0x01, 0x02, 0x1e, 0x03, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, - 0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f, 0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa4, 0x81, 0x00, 0x00, - 0x00, 0x00, 0x7a, 0x69, 0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76, - 0x55, 0x54, 0x05, 0x00, 0x03, 0x44, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7, - 0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0x50, 0x4b, 0x05, 0x06, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x01, 0x00, 0x54, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, - }; - - const auto filepath = temp_env->get_temp_dir() + "zipped_example.zip"; - { - std::ofstream outfile(filepath, std::ofstream::out | std::ofstream::binary); - outfile.write(reinterpret_cast(compressed), sizeof(compressed)); - } - - cudf_io::csv_reader_options in_opts = - cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .compression(cudf::io::compression_type::ZIP); - const auto result = cudf_io::read_csv(in_opts); - const auto result_view = result.tbl->view(); - - auto year_col = std::vector{1997L, 1997L}; - auto name_col = std::vector{"Ford", "Ford"}; - auto descr_col = std::vector{"Super, luxurious truck", "Super, luxurious truck"}; - expect_column_data_equal(year_col, result_view.column(0)); - expect_column_data_equal(name_col, result_view.column(1)); - expect_column_data_equal(descr_col, result_view.column(2)); -} - TEST_F(CsvReaderTest, InvalidFloatingPoint) { const auto filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.csv"; From 8479017e58860724f631b4b5cd5c45641700a088 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 16 Jun 2021 14:22:10 -0700 Subject: [PATCH 7/8] remove redundant code block --- python/cudf/cudf/tests/test_csv.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index f202786ffcd..27f0f684adc 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -991,14 +991,6 @@ def test_small_zip(tmpdir): fname = tmpdir.join("small_zip_file.zip") df.to_csv(fname, index=False) - df = pd.DataFrame( - { - "a": [1997] * 2, - "b": ["Ford"] * 2, - "c": ["Superd, luxurious truck"] * 2, - } - ) - got = cudf.read_csv(fname) assert_eq(df, got) @@ -1607,8 +1599,10 @@ def test_csv_writer_column_and_header_options( def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe): df = cudf_mixed_dataframe - write_str = df.to_csv(columns=[], index=False) - assert_eq(write_str, "\n") + + buffer = BytesIO() + with pytest.raises(RuntimeError): + df.to_csv(buffer, columns=[], index=False) def test_csv_writer_multiindex(tmpdir): @@ -2001,13 +1995,3 @@ def test_to_csv_compression_error(): error_message = "Writing compressed csv is not currently supported in cudf" with pytest.raises(NotImplementedError, match=re.escape(error_message)): df.to_csv("test.csv", compression=compression) - - -def test_empty_df_no_index(): - actual = cudf.DataFrame({}) - buffer = BytesIO() - actual.to_csv(buffer, index=False) - - result = cudf.read_csv(buffer) - - assert_eq(actual, result) From e408fad6fd73e832140c5489030aec2e69cb5b9e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 16 Jun 2021 22:45:09 -0700 Subject: [PATCH 8/8] revert unrelated changes --- python/cudf/cudf/tests/test_csv.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 27f0f684adc..925369048cb 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1599,10 +1599,8 @@ def test_csv_writer_column_and_header_options( def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe): df = cudf_mixed_dataframe - - buffer = BytesIO() - with pytest.raises(RuntimeError): - df.to_csv(buffer, columns=[], index=False) + write_str = df.to_csv(columns=[], index=False) + assert_eq(write_str, "\n") def test_csv_writer_multiindex(tmpdir): @@ -1995,3 +1993,13 @@ def test_to_csv_compression_error(): error_message = "Writing compressed csv is not currently supported in cudf" with pytest.raises(NotImplementedError, match=re.escape(error_message)): df.to_csv("test.csv", compression=compression) + + +def test_empty_df_no_index(): + actual = cudf.DataFrame({}) + buffer = BytesIO() + actual.to_csv(buffer, index=False) + + result = cudf.read_csv(buffer) + + assert_eq(actual, result)