Skip to content

Commit

Permalink
Docstring nits; add a stress test
Browse files Browse the repository at this point in the history
  • Loading branch information
pitrou committed Aug 18, 2022
1 parent c890501 commit 89bbef3
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 19 deletions.
16 changes: 8 additions & 8 deletions cpp/src/arrow/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ add_arrow_test(filesystem-test
EXTRA_LABELS
filesystem)

if(ARROW_BUILD_BENCHMARKS AND ARROW_PARQUET)
add_arrow_benchmark(localfs_benchmark
PREFIX
"arrow-filesystem"
SOURCES
localfs_benchmark.cc
STATIC_LINK_LIBS
${ARROW_BENCHMARK_LINK_LIBS})
if(ARROW_BUILD_BENCHMARKS)
add_arrow_benchmark(localfs_benchmark
PREFIX
"arrow-filesystem"
SOURCES
localfs_benchmark.cc
STATIC_LINK_LIBS
${ARROW_BENCHMARK_LINK_LIBS})
endif()

if(ARROW_GCS)
Expand Down
17 changes: 11 additions & 6 deletions cpp/src/arrow/filesystem/localfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,18 @@ struct ARROW_EXPORT LocalFileSystemOptions {

/// Options related to `GetFileInfoGenerator` interface.

/// How many directories should be processed in parallel
/// by the `GetFileInfoGenerator` impl.
/// EXPERIMENTAL: The maximum number of directories processed in parallel
/// by `GetFileInfoGenerator`.
int32_t directory_readahead = kDefaultDirectoryReadahead;
/// Specifies how much entries shall be aggregated into
/// a single FileInfoVector chunk by the `GetFileInfoGenerator` impl, which
/// is the result of `stat`:ing individual dirents, obtained by the call to
/// `internal::ListDir`.

/// EXPERIMENTAL: The maximum number of entries aggregated into each
/// FileInfoVector chunk by `GetFileInfoGenerator`.
///
/// Since each FileInfo entry needs a separate `stat` system call, a
/// directory with a very large number of files may take a lot of time to
/// process entirely. By generating a FileInfoVector after this chunk
/// size is reached, we ensure FileInfo entries can start being consumed
/// from the FileInfoGenerator with less initial latency.
int32_t file_info_batch_size = kDefaultFileInfoBatchSize;

/// \brief Initialize with defaults
Expand Down
6 changes: 2 additions & 4 deletions cpp/src/arrow/filesystem/localfs_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,14 @@

#include "benchmark/benchmark.h"

#include <arrow/status.h>
#include <arrow/testing/random.h>
#include <arrow/util/async_generator.h>
#include <arrow/util/string_view.h>
#include "arrow/filesystem/localfs.h"
#include "arrow/io/file.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/testing/future_util.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/util/async_generator.h"
#include "arrow/util/formatting.h"
#include "arrow/util/io_util.h"
#include "arrow/util/make_unique.h"
Expand Down
80 changes: 79 additions & 1 deletion cpp/src/arrow/filesystem/localfs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cerrno>
#include <chrono>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -163,8 +164,12 @@ class TestLocalFS : public LocalFSTestMixin {
void SetUp() {
LocalFSTestMixin::SetUp();
path_formatter_ = PathFormatter();
local_fs_ = std::make_shared<LocalFileSystem>();
local_path_ = EnsureTrailingSlash(path_formatter_(temp_dir_->path().ToString()));
MakeFileSystem();
}

void MakeFileSystem() {
local_fs_ = std::make_shared<LocalFileSystem>(options_);
fs_ = std::make_shared<SubTreeFileSystem>(local_path_, local_fs_);
}

Expand Down Expand Up @@ -248,6 +253,7 @@ class TestLocalFS : public LocalFSTestMixin {

protected:
PathFormatter path_formatter_;
LocalFileSystemOptions options_ = LocalFileSystemOptions::Defaults();
std::shared_ptr<LocalFileSystem> local_fs_;
std::shared_ptr<FileSystem> fs_;
std::string local_path_;
Expand Down Expand Up @@ -398,6 +404,78 @@ TYPED_TEST(TestLocalFS, FileMTime) {
AssertDurationBetween(t2 - infos[1].mtime(), -kTimeSlack, kTimeSlack);
}

struct DirTreeCreator {
static constexpr int kFilesPerDir = 50;
static constexpr int kDirLevels = 2;
static constexpr int kSubdirsPerDir = 8;

FileSystem* fs_;

Result<FileInfoVector> Create(const std::string& base) {
FileInfoVector infos;
RETURN_NOT_OK(Create(base, 0, &infos));
return std::move(infos);
}

Status Create(const std::string& base, int depth, FileInfoVector* infos) {
for (int i = 0; i < kFilesPerDir; ++i) {
std::stringstream ss;
ss << "f" << i;
auto path = ConcatAbstractPath(base, ss.str());
const int data_size = i % 5;
std::string data(data_size, 'x');
CreateFile(fs_, path, data);
FileInfo info(std::move(path), FileType::File);
info.set_size(data_size);
infos->push_back(std::move(info));
}
if (depth < kDirLevels) {
for (int i = 0; i < kSubdirsPerDir; ++i) {
std::stringstream ss;
ss << "d" << i;
auto path = ConcatAbstractPath(base, ss.str());
RETURN_NOT_OK(fs_->CreateDir(path));
infos->push_back(FileInfo(path, FileType::Directory));
RETURN_NOT_OK(Create(path, depth + 1, infos));
}
}
return Status::OK();
}
};

TYPED_TEST(TestLocalFS, StressGetFileInfoGenerator) {
// Stress GetFileInfoGenerator with large numbers of entries
DirTreeCreator dir_tree_creator{this->local_fs_.get()};
ASSERT_OK_AND_ASSIGN(FileInfoVector expected,
dir_tree_creator.Create(this->local_path_));
SortInfos(&expected);

for (int32_t directory_readahead : {1, 5}) {
for (int32_t file_info_batch_size : {3, 1000}) {
ARROW_SCOPED_TRACE("directory_readahead = ", directory_readahead,
", file_info_batch_size = ", file_info_batch_size);
this->options_.directory_readahead = directory_readahead;
this->options_.file_info_batch_size = file_info_batch_size;
this->MakeFileSystem();

FileSelector selector;
selector.base_dir = this->local_path_;
selector.recursive = true;

auto gen = this->local_fs_->GetFileInfoGenerator(selector);
FileInfoVector actual;
CollectFileInfoGenerator(gen, &actual);
ASSERT_EQ(actual.size(), expected.size());
SortInfos(&actual);

for (int64_t i = 0; i < static_cast<int64_t>(actual.size()); ++i) {
AssertFileInfo(actual[i], expected[i].path(), expected[i].type(),
expected[i].size());
}
}
}
}

// TODO Should we test backslash paths on Windows?
// SubTreeFileSystem isn't compatible with them.

Expand Down

0 comments on commit 89bbef3

Please sign in to comment.