diff --git a/clients/drcachesim/reader/compressed_file_reader.cpp b/clients/drcachesim/reader/compressed_file_reader.cpp index 02ba9a2cc07..e61b87038f3 100644 --- a/clients/drcachesim/reader/compressed_file_reader.cpp +++ b/clients/drcachesim/reader/compressed_file_reader.cpp @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2017-2020 Google, Inc. All rights reserved. + * Copyright (c) 2017-2022 Google, Inc. All rights reserved. * **********************************************************/ /* @@ -35,36 +35,46 @@ /* clang-format off */ /* (make vera++ newline-after-type check happy) */ template <> /* clang-format on */ -file_reader_t::~file_reader_t() +file_reader_t::~file_reader_t() { - for (auto file : input_files_) - gzclose(file); + for (auto &gzip : input_files_) + gzclose(gzip.file); delete[] thread_eof_; } template <> bool -file_reader_t::open_single_file(const std::string &path) +file_reader_t::open_single_file(const std::string &path) { gzFile file = gzopen(path.c_str(), "rb"); if (file == nullptr) return false; VPRINT(this, 1, "Opened input file %s\n", path.c_str()); - input_files_.push_back(file); + input_files_.emplace_back(file); return true; } template <> bool -file_reader_t::read_next_thread_entry(size_t thread_index, - OUT trace_entry_t *entry, OUT bool *eof) +file_reader_t::read_next_thread_entry(size_t thread_index, + OUT trace_entry_t *entry, + OUT bool *eof) { - int len = gzread(input_files_[thread_index], (char *)entry, sizeof(*entry)); - // Returns less than asked-for for end of file, or –1 for error. - if (len < (int)sizeof(*entry)) { - *eof = (len >= 0); - return false; + gzip_reader_t *gzip = &input_files_[thread_index]; + if (gzip->cur_buf >= gzip->max_buf) { + int len = gzread(gzip->file, gzip->buf, sizeof(gzip->buf)); + // Returns less than asked-for for end of file, or –1 for error. + // We should always get a multiple of the record size. + if (len < static_cast(sizeof(*entry)) || + len % static_cast(sizeof(*entry)) != 0) { + *eof = (len >= 0); + return false; + } + gzip->cur_buf = gzip->buf; + gzip->max_buf = gzip->buf + (len / sizeof(*gzip->max_buf)); } + *entry = *gzip->cur_buf; + ++gzip->cur_buf; VPRINT(this, 4, "Read from thread #%zd file: type=%d, size=%d, addr=%zu\n", thread_index, entry->type, entry->size, entry->addr); return true; @@ -72,7 +82,7 @@ file_reader_t::read_next_thread_entry(size_t thread_index, template <> bool -file_reader_t::is_complete() +file_reader_t::is_complete() { // The gzip reading interface does not support seeking to SEEK_END so there // is no efficient way to read the footer. diff --git a/clients/drcachesim/reader/compressed_file_reader.h b/clients/drcachesim/reader/compressed_file_reader.h index dce2c5cc77c..e4cfd093076 100644 --- a/clients/drcachesim/reader/compressed_file_reader.h +++ b/clients/drcachesim/reader/compressed_file_reader.h @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2017 Google, Inc. All rights reserved. + * Copyright (c) 2017-2022 Google, Inc. All rights reserved. * **********************************************************/ /* @@ -38,6 +38,22 @@ #include #include "file_reader.h" -typedef file_reader_t compressed_file_reader_t; +struct gzip_reader_t { + explicit gzip_reader_t(gzFile file) + : file(file) + { + } + gzFile file; + // Adding our own buffering to gzFile provides an 18% speedup. We use the same + // buffer size as zipfile_reader_t. + // If more readers want the same buffering we may want to bake this into the shared + // template class to avoid duplication: but some readers have good buffering + // already. + trace_entry_t buf[4096]; + trace_entry_t *cur_buf = buf; + trace_entry_t *max_buf = buf; +}; + +typedef file_reader_t compressed_file_reader_t; #endif /* _COMPRESSED_FILE_READER_H_ */ diff --git a/clients/drcachesim/reader/zipfile_file_reader.cpp b/clients/drcachesim/reader/zipfile_file_reader.cpp index 24249559f97..a1764460617 100644 --- a/clients/drcachesim/reader/zipfile_file_reader.cpp +++ b/clients/drcachesim/reader/zipfile_file_reader.cpp @@ -39,21 +39,21 @@ /* clang-format off */ /* (make vera++ newline-after-type check happy) */ template <> /* clang-format on */ -file_reader_t::~file_reader_t() +file_reader_t::~file_reader_t() { - for (auto file : input_files_) - unzClose(file); + for (auto &zipfile : input_files_) + unzClose(zipfile.file); delete[] thread_eof_; } template <> bool -file_reader_t::open_single_file(const std::string &path) +file_reader_t::open_single_file(const std::string &path) { unzFile file = unzOpen(path.c_str()); if (file == nullptr) return false; - input_files_.push_back(file); + input_files_.emplace_back(file); if (unzGoToFirstFile(file) != UNZ_OK || unzOpenCurrentFile(file) != UNZ_OK) return false; VPRINT(this, 1, "Opened input file %s\n", path.c_str()); @@ -62,54 +62,60 @@ file_reader_t::open_single_file(const std::string &path) template <> bool -file_reader_t::read_next_thread_entry(size_t thread_index, - OUT trace_entry_t *entry, OUT bool *eof) +file_reader_t::read_next_thread_entry(size_t thread_index, + OUT trace_entry_t *entry, + OUT bool *eof) { *eof = false; - // TODO i#5538: The reading performance for .zip files seems to be up to 60% - // slower than .gz files, which is unexpected since both use zlib and the checksum - // and header differences make zlib normally faster than gzip. - // Do we need to buffer ourselves here by reading many entries at once? - int num_read = unzReadCurrentFile(input_files_[thread_index], entry, sizeof(*entry)); - if (num_read == 0) { + zipfile_reader_t *zipfile = &input_files_[thread_index]; + if (zipfile->cur_buf >= zipfile->max_buf) { + int num_read = + unzReadCurrentFile(zipfile->file, zipfile->buf, sizeof(zipfile->buf)); + if (num_read == 0) { #ifdef DEBUG - if (verbosity_ >= 3) { - char name[128]; - name[0] = '\0'; /* Just in case. */ - // This call is expensive if we do it every time. - unzGetCurrentFileInfo64(input_files_[thread_index], nullptr, name, - sizeof(name), nullptr, 0, nullptr, 0); - VPRINT(this, 3, - "Thread #%zd hit end of component %s; opening next component\n", - thread_index, name); - } + if (verbosity_ >= 3) { + char name[128]; + name[0] = '\0'; /* Just in case. */ + // This call is expensive if we do it every time. + unzGetCurrentFileInfo64(zipfile->file, nullptr, name, sizeof(name), + nullptr, 0, nullptr, 0); + VPRINT(this, 3, + "Thread #%zd hit end of component %s; opening next component\n", + thread_index, name); + } #endif - // read_next_entry() stores the last entry into entry_copy_. - if ((entry_copy_.type != TRACE_TYPE_MARKER || - entry_copy_.size != TRACE_MARKER_TYPE_CHUNK_FOOTER) && - entry_copy_.type != TRACE_TYPE_FOOTER) { - VPRINT(this, 1, "Chunk is missing footer: truncation detection\n"); - return false; - } - if (unzCloseCurrentFile(input_files_[thread_index]) != UNZ_OK) - return false; - int res = unzGoToNextFile(input_files_[thread_index]); - if (res != UNZ_OK) { - if (res == UNZ_END_OF_LIST_OF_FILE) { - VPRINT(this, 2, "Thread #%zd hit EOF\n", thread_index); - *eof = true; + // read_next_entry() stores the last entry into entry_copy_. + if ((entry_copy_.type != TRACE_TYPE_MARKER || + entry_copy_.size != TRACE_MARKER_TYPE_CHUNK_FOOTER) && + entry_copy_.type != TRACE_TYPE_FOOTER) { + VPRINT(this, 1, "Chunk is missing footer: truncation detected\n"); + return false; } - return false; + if (unzCloseCurrentFile(zipfile->file) != UNZ_OK) + return false; + int res = unzGoToNextFile(zipfile->file); + if (res != UNZ_OK) { + if (res == UNZ_END_OF_LIST_OF_FILE) { + VPRINT(this, 2, "Thread #%zd hit EOF\n", thread_index); + *eof = true; + } + return false; + } + if (unzOpenCurrentFile(zipfile->file) != UNZ_OK) + return false; + num_read = + unzReadCurrentFile(zipfile->file, zipfile->buf, sizeof(zipfile->buf)); } - if (unzOpenCurrentFile(input_files_[thread_index]) != UNZ_OK) + if (num_read < static_cast(sizeof(*entry))) { + VPRINT(this, 1, "Thread #%zd failed to read: returned %d\n", thread_index, + num_read); return false; - num_read = unzReadCurrentFile(input_files_[thread_index], entry, sizeof(*entry)); - } - if (num_read < static_cast(sizeof(*entry))) { - VPRINT(this, 1, "Thread #%zd failed to read: returned %d\n", thread_index, - num_read); - return false; + } + zipfile->cur_buf = zipfile->buf; + zipfile->max_buf = zipfile->buf + (num_read / sizeof(*zipfile->max_buf)); } + *entry = *zipfile->cur_buf; + ++zipfile->cur_buf; VPRINT(this, 4, "Read from thread #%zd: type=%d, size=%d, addr=%zu\n", thread_index, entry->type, entry->size, entry->addr); return true; @@ -117,7 +123,7 @@ file_reader_t::read_next_thread_entry(size_t thread_index, template <> bool -file_reader_t::is_complete() +file_reader_t::is_complete() { // We could call unzeof() but we need the thread index. // XXX: We should remove or change this interface. It is not used now. diff --git a/clients/drcachesim/reader/zipfile_file_reader.h b/clients/drcachesim/reader/zipfile_file_reader.h index 29b8d22512b..1d6d45a275e 100644 --- a/clients/drcachesim/reader/zipfile_file_reader.h +++ b/clients/drcachesim/reader/zipfile_file_reader.h @@ -39,6 +39,20 @@ #include "minizip/unzip.h" #include "file_reader.h" -typedef file_reader_t zipfile_file_reader_t; +struct zipfile_reader_t { + explicit zipfile_reader_t(unzFile file) + : file(file) + { + } + unzFile file; + // Without our own buffering, reading one trace_entry_t record at a time + // is 60% slower. This buffer size was picked through experimentation to + // perform well without taking up too much memory. + trace_entry_t buf[4096]; + trace_entry_t *cur_buf = buf; + trace_entry_t *max_buf = buf; +}; + +typedef file_reader_t zipfile_file_reader_t; #endif /* _ZIPFILE_FILE_READER_H_ */