From fd1e7c60bf6893b6fe9f88c6c81dedbc17f2b76c Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 11 Jun 2021 08:15:26 -0700
Subject: [PATCH 1/8] fixed OOB access for small files in unzip

---
 cpp/src/io/comp/uncomp.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index d5166b76892..f96aad33ead 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -210,6 +210,9 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
         // Start of central directory
         if (cdfh->sig == 0x02014b50) { dst->cdfh = cdfh; }
       }
+      // ensure we don't read further than the beginning of the file and we don't underflow
+      if(i == 0)
+        break;
     }
   }
   return (dst->eocd && dst->cdfh);

From ca1c679ac7caf3ba5034912160fd7f42c2437944 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 11 Jun 2021 10:41:24 -0700
Subject: [PATCH 2/8] fixing style

---
 cpp/src/io/comp/uncomp.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index f96aad33ead..039665b34f7 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -211,8 +211,7 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
         if (cdfh->sig == 0x02014b50) { dst->cdfh = cdfh; }
       }
       // ensure we don't read further than the beginning of the file and we don't underflow
-      if(i == 0)
-        break;
+      if (i == 0) break;
     }
   }
   return (dst->eocd && dst->cdfh);

From 36e1c89cc05ff7b4acd471df023cf0c5fca653cc Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 15 Jun 2021 02:36:37 -0700
Subject: [PATCH 3/8] added unit test for reading zip-compressed CSV

---
 cpp/tests/io/csv_test.cpp | 42 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index e45b67505ba..c61aebf1e86 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1081,6 +1081,48 @@ TEST_F(CsvReaderTest, ArrowFileSource)
   expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
 }
 
+TEST_F(CsvReaderTest, ZippedCSV)
+{
+  // input is the following CSV zip'ed:
+  // "year,name,description\n1997,Ford,"Super, luxurious truck"\n1997,Ford,"Super, luxurious truck"
+  constexpr uint8_t compressed[] = {
+    0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f,
+    0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1c, 0x00, 0x7a, 0x69,
+    0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76, 0x55, 0x54, 0x09, 0x00,
+    0x03, 0x44, 0x72, 0xc8, 0x60, 0x45, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7,
+    0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0xab, 0x4c, 0x4d, 0x2c, 0xd2, 0xc9, 0x4b, 0xcc,
+    0x4d, 0xd5, 0x49, 0x49, 0x2d, 0x4e, 0x2e, 0xca, 0x2c, 0x28, 0xc9, 0xcc, 0xcf, 0xe3, 0x32, 0xb4,
+    0xb4, 0x34, 0xd7, 0x71, 0xcb, 0x2f, 0x4a, 0xd1, 0x51, 0x0a, 0x2e, 0x2d, 0x48, 0x2d, 0xd2, 0x51,
+    0xc8, 0x29, 0xad, 0x28, 0x2d, 0xca, 0xcc, 0x2f, 0x2d, 0x56, 0x28, 0x29, 0x2a, 0x4d, 0xce, 0x56,
+    0x22, 0x46, 0x09, 0x00, 0x50, 0x4b, 0x01, 0x02, 0x1e, 0x03, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00,
+    0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f, 0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00,
+    0x0e, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa4, 0x81, 0x00, 0x00,
+    0x00, 0x00, 0x7a, 0x69, 0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76,
+    0x55, 0x54, 0x05, 0x00, 0x03, 0x44, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7,
+    0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0x50, 0x4b, 0x05, 0x06, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x01, 0x00, 0x54, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,
+  };
+
+  const auto filepath = temp_env->get_temp_dir() + "zipped_example.zip";
+  {
+    std::ofstream outfile(filepath, std::ofstream::out | std::ofstream::binary);
+    outfile.write(reinterpret_cast<const char*>(compressed), sizeof(compressed));
+  }
+
+  cudf_io::csv_reader_options in_opts =
+    cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
+      .compression(cudf::io::compression_type::ZIP);
+  const auto result      = cudf_io::read_csv(in_opts);
+  const auto result_view = result.tbl->view();
+
+  auto year_col  = std::vector<int64_t>{1997L, 1997L};
+  auto name_col  = std::vector<std::string>{"Ford", "Ford"};
+  auto descr_col = std::vector<std::string>{"Super, luxurious truck", "Super, luxurious truck"};
+  expect_column_data_equal(year_col, result_view.column(0));
+  expect_column_data_equal(name_col, result_view.column(1));
+  expect_column_data_equal(descr_col, result_view.column(2));
+}
+
 TEST_F(CsvReaderTest, InvalidFloatingPoint)
 {
   const auto filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.csv";

From e4db732509cbeed2038f67277a6f7a9300a1c455 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 15 Jun 2021 02:48:39 -0700
Subject: [PATCH 4/8] replaced underflow handling

---
 cpp/src/io/comp/uncomp.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 039665b34f7..44581bbc184 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -193,16 +193,18 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
   memset(dst, 0, sizeof(zip_archive_s));
   // Find the end of central directory
   if (len >= sizeof(zip_eocd_s) + 2) {
-    for (size_t i = len - sizeof(zip_eocd_s) - 2; i + sizeof(zip_eocd_s) + 2 + 0xffff >= len; i--) {
+    for (ptrdiff_t i = len - sizeof(zip_eocd_s) - 2;
+         i + sizeof(zip_eocd_s) + 2 + 0xffff >= len && i >= 0;
+         i--) {
       const zip_eocd_s *eocd = reinterpret_cast<zip_eocd_s const *>(raw + i);
       if (eocd->sig == 0x06054b50 &&
           eocd->disk_id == eocd->start_disk  // multi-file archives not supported
           && eocd->num_entries == eocd->total_entries &&
           eocd->cdir_size >= sizeof(zip_cdfh_s) * eocd->num_entries && eocd->cdir_offset < len &&
-          i + *reinterpret_cast<const uint16_t *>(eocd + 1) <= len) {
+          i + *reinterpret_cast<const uint16_t *>(eocd + 1) <= static_cast<ptrdiff_t>(len)) {
         const zip_cdfh_s *cdfh = reinterpret_cast<const zip_cdfh_s *>(raw + eocd->cdir_offset);
         dst->eocd              = eocd;
-        if (i >= sizeof(zip64_eocdl)) {
+        if (i >= static_cast<ptrdiff_t>(sizeof(zip64_eocdl))) {
           const zip64_eocdl *eocdl =
             reinterpret_cast<const zip64_eocdl *>(raw + i - sizeof(zip64_eocdl));
           if (eocdl->sig == 0x07064b50) { dst->eocdl = eocdl; }
@@ -210,8 +212,6 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
         // Start of central directory
         if (cdfh->sig == 0x02014b50) { dst->cdfh = cdfh; }
       }
-      // ensure we don't read further than the beginning of the file and we don't underflow
-      if (i == 0) break;
     }
   }
   return (dst->eocd && dst->cdfh);

From d4a3a78b47af5791acb0e1ef706a2277e6e75de2 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 16 Jun 2021 00:18:31 -0700
Subject: [PATCH 5/8] add python test for small zip files

---
 python/cudf/cudf/tests/test_csv.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 0a31624ef7c..a6396756a63 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -979,6 +979,30 @@ def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src):
     assert_eq(expect, got)
 
 
+def test_small_zip(tmpdir):
+    df = pd.DataFrame(
+        {
+            "a": [1997] * 2,
+            "b": ["Ford"] * 2,
+            "c": ["Super, luxurious truck"] * 2,
+        }
+    )
+
+    fname = tmpdir.join("small_zip_file.zip")
+    df.to_csv(fname, index=False)
+
+    df = pd.DataFrame(
+        {
+            "a": [1997] * 2,
+            "b": ["Ford"] * 2,
+            "c": ["Superd, luxurious truck"] * 2,
+        }
+    )
+
+    got = cudf.read_csv(fname)
+    assert_eq(df, got)
+
+
 def test_csv_reader_carriage_return(tmpdir):
     rows = 1000
     names = ["int_row", "int_double_row"]

From 3c9e6df7449796ca8039a0c082787811c4512d10 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 16 Jun 2021 08:37:28 -0700
Subject: [PATCH 6/8] reverted c++ test in favor of python test

---
 cpp/tests/io/csv_test.cpp | 42 ---------------------------------------
 1 file changed, 42 deletions(-)

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index c2e707d9f06..f2278267f74 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1081,48 +1081,6 @@ TEST_F(CsvReaderTest, ArrowFileSource)
   expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
 }
 
-TEST_F(CsvReaderTest, ZippedCSV)
-{
-  // input is the following CSV zip'ed:
-  // "year,name,description\n1997,Ford,"Super, luxurious truck"\n1997,Ford,"Super, luxurious truck"
-  constexpr uint8_t compressed[] = {
-    0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f,
-    0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1c, 0x00, 0x7a, 0x69,
-    0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76, 0x55, 0x54, 0x09, 0x00,
-    0x03, 0x44, 0x72, 0xc8, 0x60, 0x45, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7,
-    0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0xab, 0x4c, 0x4d, 0x2c, 0xd2, 0xc9, 0x4b, 0xcc,
-    0x4d, 0xd5, 0x49, 0x49, 0x2d, 0x4e, 0x2e, 0xca, 0x2c, 0x28, 0xc9, 0xcc, 0xcf, 0xe3, 0x32, 0xb4,
-    0xb4, 0x34, 0xd7, 0x71, 0xcb, 0x2f, 0x4a, 0xd1, 0x51, 0x0a, 0x2e, 0x2d, 0x48, 0x2d, 0xd2, 0x51,
-    0xc8, 0x29, 0xad, 0x28, 0x2d, 0xca, 0xcc, 0x2f, 0x2d, 0x56, 0x28, 0x29, 0x2a, 0x4d, 0xce, 0x56,
-    0x22, 0x46, 0x09, 0x00, 0x50, 0x4b, 0x01, 0x02, 0x1e, 0x03, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00,
-    0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f, 0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00,
-    0x0e, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa4, 0x81, 0x00, 0x00,
-    0x00, 0x00, 0x7a, 0x69, 0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76,
-    0x55, 0x54, 0x05, 0x00, 0x03, 0x44, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7,
-    0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0x50, 0x4b, 0x05, 0x06, 0x00, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x01, 0x00, 0x54, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,
-  };
-
-  const auto filepath = temp_env->get_temp_dir() + "zipped_example.zip";
-  {
-    std::ofstream outfile(filepath, std::ofstream::out | std::ofstream::binary);
-    outfile.write(reinterpret_cast<const char*>(compressed), sizeof(compressed));
-  }
-
-  cudf_io::csv_reader_options in_opts =
-    cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
-      .compression(cudf::io::compression_type::ZIP);
-  const auto result      = cudf_io::read_csv(in_opts);
-  const auto result_view = result.tbl->view();
-
-  auto year_col  = std::vector<int64_t>{1997L, 1997L};
-  auto name_col  = std::vector<std::string>{"Ford", "Ford"};
-  auto descr_col = std::vector<std::string>{"Super, luxurious truck", "Super, luxurious truck"};
-  expect_column_data_equal(year_col, result_view.column(0));
-  expect_column_data_equal(name_col, result_view.column(1));
-  expect_column_data_equal(descr_col, result_view.column(2));
-}
-
 TEST_F(CsvReaderTest, InvalidFloatingPoint)
 {
   const auto filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.csv";

From 8479017e58860724f631b4b5cd5c45641700a088 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 16 Jun 2021 14:22:10 -0700
Subject: [PATCH 7/8] remove redundant code block

---
 python/cudf/cudf/tests/test_csv.py | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index f202786ffcd..27f0f684adc 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -991,14 +991,6 @@ def test_small_zip(tmpdir):
     fname = tmpdir.join("small_zip_file.zip")
     df.to_csv(fname, index=False)
 
-    df = pd.DataFrame(
-        {
-            "a": [1997] * 2,
-            "b": ["Ford"] * 2,
-            "c": ["Superd, luxurious truck"] * 2,
-        }
-    )
-
     got = cudf.read_csv(fname)
     assert_eq(df, got)
 
@@ -1607,8 +1599,10 @@ def test_csv_writer_column_and_header_options(
 
 def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe):
     df = cudf_mixed_dataframe
-    write_str = df.to_csv(columns=[], index=False)
-    assert_eq(write_str, "\n")
+
+    buffer = BytesIO()
+    with pytest.raises(RuntimeError):
+        df.to_csv(buffer, columns=[], index=False)
 
 
 def test_csv_writer_multiindex(tmpdir):
@@ -2001,13 +1995,3 @@ def test_to_csv_compression_error():
     error_message = "Writing compressed csv is not currently supported in cudf"
     with pytest.raises(NotImplementedError, match=re.escape(error_message)):
         df.to_csv("test.csv", compression=compression)
-
-
-def test_empty_df_no_index():
-    actual = cudf.DataFrame({})
-    buffer = BytesIO()
-    actual.to_csv(buffer, index=False)
-
-    result = cudf.read_csv(buffer)
-
-    assert_eq(actual, result)

From e408fad6fd73e832140c5489030aec2e69cb5b9e Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 16 Jun 2021 22:45:09 -0700
Subject: [PATCH 8/8] revert unrelated changes

---
 python/cudf/cudf/tests/test_csv.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 27f0f684adc..925369048cb 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1599,10 +1599,8 @@ def test_csv_writer_column_and_header_options(
 
 def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe):
     df = cudf_mixed_dataframe
-
-    buffer = BytesIO()
-    with pytest.raises(RuntimeError):
-        df.to_csv(buffer, columns=[], index=False)
+    write_str = df.to_csv(columns=[], index=False)
+    assert_eq(write_str, "\n")
 
 
 def test_csv_writer_multiindex(tmpdir):
@@ -1995,3 +1993,13 @@ def test_to_csv_compression_error():
     error_message = "Writing compressed csv is not currently supported in cudf"
     with pytest.raises(NotImplementedError, match=re.escape(error_message)):
         df.to_csv("test.csv", compression=compression)
+
+
+def test_empty_df_no_index():
+    actual = cudf.DataFrame({})
+    buffer = BytesIO()
+    actual.to_csv(buffer, index=False)
+
+    result = cudf.read_csv(buffer)
+
+    assert_eq(actual, result)