Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes out-of-bounds access for small files in unzip #8498

Merged
8 changes: 5 additions & 3 deletions cpp/src/io/comp/uncomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,16 +193,18 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
memset(dst, 0, sizeof(zip_archive_s));
// Find the end of central directory
if (len >= sizeof(zip_eocd_s) + 2) {
for (size_t i = len - sizeof(zip_eocd_s) - 2; i + sizeof(zip_eocd_s) + 2 + 0xffff >= len; i--) {
for (ptrdiff_t i = len - sizeof(zip_eocd_s) - 2;
i + sizeof(zip_eocd_s) + 2 + 0xffff >= len && i >= 0;
i--) {
const zip_eocd_s *eocd = reinterpret_cast<zip_eocd_s const *>(raw + i);
if (eocd->sig == 0x06054b50 &&
eocd->disk_id == eocd->start_disk // multi-file archives not supported
&& eocd->num_entries == eocd->total_entries &&
eocd->cdir_size >= sizeof(zip_cdfh_s) * eocd->num_entries && eocd->cdir_offset < len &&
i + *reinterpret_cast<const uint16_t *>(eocd + 1) <= len) {
i + *reinterpret_cast<const uint16_t *>(eocd + 1) <= static_cast<ptrdiff_t>(len)) {
const zip_cdfh_s *cdfh = reinterpret_cast<const zip_cdfh_s *>(raw + eocd->cdir_offset);
dst->eocd = eocd;
if (i >= sizeof(zip64_eocdl)) {
if (i >= static_cast<ptrdiff_t>(sizeof(zip64_eocdl))) {
const zip64_eocdl *eocdl =
reinterpret_cast<const zip64_eocdl *>(raw + i - sizeof(zip64_eocdl));
if (eocdl->sig == 0x07064b50) { dst->eocdl = eocdl; }
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -979,6 +979,22 @@ def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src):
assert_eq(expect, got)


def test_small_zip(tmpdir):
    """Round-trip a tiny two-row frame through a ``.zip``-named CSV file.

    The frame is written with pandas to ``small_zip_file.zip`` inside
    ``tmpdir`` and read back with ``cudf.read_csv``; the result must equal
    the frame that was written.
    """
    # NOTE(review): presumably pandas picks zip compression from the ".zip"
    # suffix, so the file on disk is a very small archive -- confirm.
    repeat = 2
    expect = pd.DataFrame(
        dict(
            a=[1997] * repeat,
            b=["Ford"] * repeat,
            c=["Super, luxurious truck"] * repeat,
        )
    )

    zip_path = tmpdir.join("small_zip_file.zip")
    expect.to_csv(zip_path, index=False)

    assert_eq(expect, cudf.read_csv(zip_path))


def test_csv_reader_carriage_return(tmpdir):
rows = 1000
names = ["int_row", "int_double_row"]
Expand Down