Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes out-of-bounds access for small files in unzip #8498

Merged
8 changes: 5 additions & 3 deletions cpp/src/io/comp/uncomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,16 +193,18 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
memset(dst, 0, sizeof(zip_archive_s));
// Find the end of central directory
if (len >= sizeof(zip_eocd_s) + 2) {
for (size_t i = len - sizeof(zip_eocd_s) - 2; i + sizeof(zip_eocd_s) + 2 + 0xffff >= len; i--) {
for (ptrdiff_t i = len - sizeof(zip_eocd_s) - 2;
i + sizeof(zip_eocd_s) + 2 + 0xffff >= len && i >= 0;
i--) {
const zip_eocd_s *eocd = reinterpret_cast<zip_eocd_s const *>(raw + i);
if (eocd->sig == 0x06054b50 &&
eocd->disk_id == eocd->start_disk // multi-file archives not supported
&& eocd->num_entries == eocd->total_entries &&
eocd->cdir_size >= sizeof(zip_cdfh_s) * eocd->num_entries && eocd->cdir_offset < len &&
i + *reinterpret_cast<const uint16_t *>(eocd + 1) <= len) {
i + *reinterpret_cast<const uint16_t *>(eocd + 1) <= static_cast<ptrdiff_t>(len)) {
const zip_cdfh_s *cdfh = reinterpret_cast<const zip_cdfh_s *>(raw + eocd->cdir_offset);
dst->eocd = eocd;
if (i >= sizeof(zip64_eocdl)) {
if (i >= static_cast<ptrdiff_t>(sizeof(zip64_eocdl))) {
const zip64_eocdl *eocdl =
reinterpret_cast<const zip64_eocdl *>(raw + i - sizeof(zip64_eocdl));
if (eocdl->sig == 0x07064b50) { dst->eocdl = eocdl; }
Expand Down
42 changes: 42 additions & 0 deletions cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,48 @@ TEST_F(CsvReaderTest, ArrowFileSource)
expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
}

TEST_F(CsvReaderTest, ZippedCSV)
elstehle marked this conversation as resolved.
Show resolved Hide resolved
{
// input is the following CSV zip'ed:
// "year,name,description\n1997,Ford,"Super, luxurious truck"\n1997,Ford,"Super, luxurious truck"
constexpr uint8_t compressed[] = {
0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f,
0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1c, 0x00, 0x7a, 0x69,
0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76, 0x55, 0x54, 0x09, 0x00,
0x03, 0x44, 0x72, 0xc8, 0x60, 0x45, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7,
0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0xab, 0x4c, 0x4d, 0x2c, 0xd2, 0xc9, 0x4b, 0xcc,
0x4d, 0xd5, 0x49, 0x49, 0x2d, 0x4e, 0x2e, 0xca, 0x2c, 0x28, 0xc9, 0xcc, 0xcf, 0xe3, 0x32, 0xb4,
0xb4, 0x34, 0xd7, 0x71, 0xcb, 0x2f, 0x4a, 0xd1, 0x51, 0x0a, 0x2e, 0x2d, 0x48, 0x2d, 0xd2, 0x51,
0xc8, 0x29, 0xad, 0x28, 0x2d, 0xca, 0xcc, 0x2f, 0x2d, 0x56, 0x28, 0x29, 0x2a, 0x4d, 0xce, 0x56,
0x22, 0x46, 0x09, 0x00, 0x50, 0x4b, 0x01, 0x02, 0x1e, 0x03, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00,
0x4e, 0x5b, 0xcf, 0x52, 0xaf, 0x6f, 0xfa, 0xe1, 0x3c, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00,
0x0e, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa4, 0x81, 0x00, 0x00,
0x00, 0x00, 0x7a, 0x69, 0x70, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x76,
0x55, 0x54, 0x05, 0x00, 0x03, 0x44, 0x72, 0xc8, 0x60, 0x75, 0x78, 0x0b, 0x00, 0x01, 0x04, 0xf7,
0x01, 0x00, 0x00, 0x04, 0x14, 0x00, 0x00, 0x00, 0x50, 0x4b, 0x05, 0x06, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x01, 0x00, 0x54, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,
};

const auto filepath = temp_env->get_temp_dir() + "zipped_example.zip";
{
std::ofstream outfile(filepath, std::ofstream::out | std::ofstream::binary);
outfile.write(reinterpret_cast<const char*>(compressed), sizeof(compressed));
}

cudf_io::csv_reader_options in_opts =
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
.compression(cudf::io::compression_type::ZIP);
const auto result = cudf_io::read_csv(in_opts);
const auto result_view = result.tbl->view();

auto year_col = std::vector<int64_t>{1997L, 1997L};
auto name_col = std::vector<std::string>{"Ford", "Ford"};
auto descr_col = std::vector<std::string>{"Super, luxurious truck", "Super, luxurious truck"};
expect_column_data_equal(year_col, result_view.column(0));
expect_column_data_equal(name_col, result_view.column(1));
expect_column_data_equal(descr_col, result_view.column(2));
}

TEST_F(CsvReaderTest, InvalidFloatingPoint)
{
const auto filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.csv";
Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -979,6 +979,30 @@ def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src):
assert_eq(expect, got)


def test_small_zip(tmpdir):
df = pd.DataFrame(
{
"a": [1997] * 2,
"b": ["Ford"] * 2,
"c": ["Super, luxurious truck"] * 2,
}
)

fname = tmpdir.join("small_zip_file.zip")
df.to_csv(fname, index=False)

df = pd.DataFrame(
{
"a": [1997] * 2,
"b": ["Ford"] * 2,
"c": ["Superd, luxurious truck"] * 2,
}
)

vuule marked this conversation as resolved.
Show resolved Hide resolved
got = cudf.read_csv(fname)
assert_eq(df, got)


def test_csv_reader_carriage_return(tmpdir):
rows = 1000
names = ["int_row", "int_double_row"]
Expand Down