From 543900dce132a401e00b677ffb5d3a07c3baffc7 Mon Sep 17 00:00:00 2001 From: Jakob Blomer Date: Tue, 29 Oct 2024 08:50:06 +0100 Subject: [PATCH] [ntuple] adjust default cluster size Following the experience with AGC testing, double the default compressed cluster size to 100 MB and also double the maximum uncompressed cluster size to 1 GiB. --- tree/ntuple/v7/doc/tuning.md | 4 ++-- tree/ntuple/v7/inc/ROOT/RNTupleWriteOptions.hxx | 4 ++-- tree/ntupleutil/v7/inc/ROOT/RNTupleImporter.hxx | 2 +- tree/ntupleutil/v7/src/RNTupleImporter.cxx | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tree/ntuple/v7/doc/tuning.md b/tree/ntuple/v7/doc/tuning.md index 854deb2457043..c73cd914f8f3c 100644 --- a/tree/ntuple/v7/doc/tuning.md +++ b/tree/ntuple/v7/doc/tuning.md @@ -5,7 +5,7 @@ A cluster contains all the data of a given event range. As clusters are usually compressed and tied to event boundaries, an exact size cannot be enforced. Instead, RNTuple uses a *target size* for the compressed data as a guideline for when to flush a cluster. -The default cluster target size is 50MB of compressed data. +The default cluster target size is 100 MB of compressed data. The default can be changed by the `RNTupleWriteOptions`. The default should work well in the majority of cases. In general, larger clusters provide room for more and larger pages and should improve compression ratio and speed. @@ -13,7 +13,7 @@ However, clusters also need to be buffered during write and (partially) during r so larger clusters increase the memory footprint. A second option in `RNTupleWriteOptions` specifies the maximum uncompressed cluster size. -The default is 512MiB. +The default is 1 GiB. This setting acts as an "emergency break" and should prevent very compressible clusters from growing too large. 
Given the two settings, writing works as follows: diff --git a/tree/ntuple/v7/inc/ROOT/RNTupleWriteOptions.hxx b/tree/ntuple/v7/inc/ROOT/RNTupleWriteOptions.hxx index 1d933f5ebad69..6626325b8bbb7 100644 --- a/tree/ntuple/v7/inc/ROOT/RNTupleWriteOptions.hxx +++ b/tree/ntuple/v7/inc/ROOT/RNTupleWriteOptions.hxx @@ -61,10 +61,10 @@ public: protected: int fCompression{RCompressionSetting::EDefaults::kUseGeneralPurpose}; /// Approximation of the target compressed cluster size - std::size_t fApproxZippedClusterSize = 50 * 1000 * 1000; + std::size_t fApproxZippedClusterSize = 100 * 1000 * 1000; /// Memory limit for committing a cluster: with very high compression ratio, we need a limit /// on how large the I/O buffer can grow during writing. - std::size_t fMaxUnzippedClusterSize = 512 * 1024 * 1024; + std::size_t fMaxUnzippedClusterSize = 1024 * 1024 * 1024; /// Initially, columns start with a page large enough to hold the given number of elements. The initial /// page size is the given number of elements multiplied by the column's element size. /// If more elements are needed, pages are increased up until the byte limit given by fMaxUnzippedPageSize diff --git a/tree/ntupleutil/v7/inc/ROOT/RNTupleImporter.hxx b/tree/ntupleutil/v7/inc/ROOT/RNTupleImporter.hxx index 8bf567aced529..98669a31efded 100644 --- a/tree/ntupleutil/v7/inc/ROOT/RNTupleImporter.hxx +++ b/tree/ntupleutil/v7/inc/ROOT/RNTupleImporter.hxx @@ -105,7 +105,7 @@ public: /// Used to make adjustments to the fields of the output model. using FieldModifier_t = std::function; - /// Used to report every ~50MB (compressed), and at the end about the status of the import. + /// Used to report every ~100 MB (compressed), and at the end about the status of the import. 
class RProgressCallback { public: virtual ~RProgressCallback() = default; diff --git a/tree/ntupleutil/v7/src/RNTupleImporter.cxx b/tree/ntupleutil/v7/src/RNTupleImporter.cxx index 2d93b38dcfe69..3c9b98c227bf0 100644 --- a/tree/ntupleutil/v7/src/RNTupleImporter.cxx +++ b/tree/ntupleutil/v7/src/RNTupleImporter.cxx @@ -43,14 +43,14 @@ namespace { class RDefaultProgressCallback : public ROOT::Experimental::RNTupleImporter::RProgressCallback { private: - static constexpr std::uint64_t gUpdateFrequencyBytes = 50 * 1000 * 1000; // report every 50MB + static constexpr std::uint64_t gUpdateFrequencyBytes = 100 * 1000 * 1000; // report every 100 MB std::uint64_t fNbytesNext = gUpdateFrequencyBytes; public: ~RDefaultProgressCallback() override {} void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final { - // Report if more than 50MB (compressed) where written since the last status update + // Report if more than 100 MB (compressed) were written since the last status update if (nbytesWritten < fNbytesNext) return; std::cout << "Wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";