feat: trie cache factory to allow variable cache sizes (#7022)
Longarithm authored and nikurt committed Jun 15, 2022
1 parent 0cfed42 commit 1f1701b
Showing 12 changed files with 135 additions and 89 deletions.
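
In short, this commit replaces the old ShardTries::new(store, shard_version, num_shards) constructor with one that takes a TrieCacheFactory, so trie cache capacities can vary per shard. A minimal before/after sketch of a call site, using the re-exports visible in core/store/src/lib.rs below; the shard numbers and the 2_000_000 capacity are illustrative, not mandated by the diff:

    // Before this commit: every shard got a trie cache of the same fixed size.
    // let tries = ShardTries::new(store.clone(), 0, num_shards);

    // After this commit: a TrieCacheFactory carries optional per-shard capacities.
    use std::collections::HashMap;
    use near_primitives::shard_layout::ShardUId;
    use near_store::{ShardTries, Store, TrieCacheFactory};

    fn build_tries(store: Store, num_shards: u64) -> ShardTries {
        let mut capacities = HashMap::new();
        // Override the cache size for a single shard; every other shard keeps the default.
        capacities.insert(ShardUId { version: 1, shard_id: 3 }, 2_000_000);
        let factory = TrieCacheFactory::new(capacities, 1, num_shards);
        ShardTries::new(store, factory)
    }

Tests keep a one-line shortcut, ShardTries::test(store, num_shards), which wraps a factory with no overrides.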
2 changes: 1 addition & 1 deletion chain/chain/src/test_utils.rs
@@ -140,7 +140,7 @@ impl KeyValueRuntime {
epoch_length: u64,
no_gc: bool,
) -> Self {
let tries = ShardTries::new(store.clone(), 0, num_shards);
let tries = ShardTries::test(store.clone(), num_shards);
let mut initial_amounts = HashMap::new();
for (i, validator) in validators.iter().flatten().enumerate() {
initial_amounts.insert(validator.clone(), (1000 + 100 * i) as u128);
2 changes: 1 addition & 1 deletion core/primitives/src/shard_layout.rs
@@ -262,7 +262,7 @@ fn is_top_level_account(top_account: &AccountId, account: &AccountId) -> bool {
}

/// ShardUId is a unique representation for shards from different shard layouts
#[derive(Hash, Clone, Debug, Copy, PartialEq, Eq, PartialOrd, Ord)]
#[derive(Serialize, Deserialize, Hash, Clone, Debug, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct ShardUId {
pub version: ShardVersion,
pub shard_id: u32,
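
Deriving Serialize and Deserialize here is what allows ShardUId to appear inside the store config further down. A quick round-trip sketch; using serde_json as the codec and the exact JSON shape (serde's default struct encoding) are assumptions, since neither is shown in this diff:

    use near_primitives::shard_layout::ShardUId;

    fn shard_uid_roundtrip() -> serde_json::Result<()> {
        let uid = ShardUId { version: 1, shard_id: 3 };
        // Expected to encode roughly as {"version":1,"shard_id":3}.
        let json = serde_json::to_string(&uid)?;
        let back: ShardUId = serde_json::from_str(&json)?;
        assert_eq!(uid, back);
        Ok(())
    }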
92 changes: 41 additions & 51 deletions core/store/src/config.rs
@@ -1,16 +1,16 @@
use near_primitives::shard_layout::ShardUId;

#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub struct StoreConfig {
/// Attempted writes to the DB will fail. Doesn't require a `LOCK` file.
#[serde(skip)]
pub read_only: bool,

/// Collect internal storage layer statistics.
/// Minor performance impact is expected.
#[serde(default)]
pub enable_statistics: bool,

/// Re-export storage layer statistics as prometheus metrics.
#[serde(default = "default_enable_statistics_export")]
pub enable_statistics_export: bool,

/// Maximum number of store files being opened simultaneously.
@@ -20,81 +20,71 @@ pub struct StoreConfig {
/// needs.
/// Increasing this value up to a value higher than 1024 also requires setting `ulimit -n` in
/// Linux.
#[serde(default = "default_max_open_files")]
pub max_open_files: u32,

/// Cache size for DBCol::State column.
/// Default value: 512MiB.
/// Increasing the DBCol::State cache size helps make storage more efficient. On the other hand, we
/// don't want to hugely increase the requirements for running a node, so currently we use a small
/// default value for it.
#[serde(default = "default_col_state_cache_size")]
pub col_state_cache_size: bytesize::ByteSize,

/// Block size used internally in RocksDB.
/// Default value: 16KiB.
/// We're still experimented with this parameter and it seems decreasing its value can improve
/// We're still experimenting with this parameter and it seems decreasing its value can improve
/// the performance of the storage
#[serde(default = "default_block_size")]
pub block_size: bytesize::ByteSize,
}

const fn default_enable_statistics_export() -> bool {
StoreConfig::const_default().enable_statistics_export
}

const fn default_max_open_files() -> u32 {
StoreConfig::const_default().max_open_files
}

const fn default_col_state_cache_size() -> bytesize::ByteSize {
StoreConfig::const_default().col_state_cache_size
}

const fn default_block_size() -> bytesize::ByteSize {
StoreConfig::const_default().block_size
/// Trie cache capacities
/// Default value: ShardUId {version: 1, shard_id: 3} -> 2_000_000. TODO: clarify
/// We're still experimenting with this parameter and it seems decreasing its value can improve
/// the performance of the storage
pub trie_cache_capacities: Vec<(ShardUId, usize)>,
}

impl StoreConfig {
/// We've used a value of 512 for max_open_files since 3 Dec 2019. As it turned out we were
/// hitting that limit and store had to constantly close/reopen the same set of files.
/// Running state viewer on a dense set of 500 blocks did almost 200K file opens (having less
/// than 7K unique files opened, some files were opened 400+ times).
/// Using 10K limit for max_open_files led to performance improvement of ~11%.
const DEFAULT_MAX_OPEN_FILES: u32 = 10_000;

/// We used to have the same cache size for all columns 32MB. When some RocksDB
/// inefficiencies were found DBCol::State cache size was increased up to 512MB.
/// This was done Nov 13 2021 and we consider increasing the value.
/// Tests have shown that increase of col_state_cache_size up to 25GB (we've used this big
/// value to estimate performance improvement headroom) having max_open_files=10K improved
/// performance of state viewer by 60%.
const DEFAULT_COL_STATE_CACHE_SIZE: bytesize::ByteSize = bytesize::ByteSize::mib(512);

/// Earlier this value was taken from the openethereum default parameter and we use it since
/// then.
const DEFAULT_BLOCK_SIZE: bytesize::ByteSize = bytesize::ByteSize::kib(16);

const fn const_default() -> Self {
impl Default for StoreConfig {
fn default() -> Self {
Self {
read_only: false,
enable_statistics: false,
enable_statistics_export: true,
max_open_files: Self::DEFAULT_MAX_OPEN_FILES,
col_state_cache_size: Self::DEFAULT_COL_STATE_CACHE_SIZE,
block_size: Self::DEFAULT_BLOCK_SIZE,

// We used to use value of 512 but we were hitting that limit often
// and store had to constantly close and reopen the same set of
// files. Running state viewer on a dense set of 500 blocks did
// almost 200k file opens (having less than 7K unique files opened,
// some files were opened 400+ times). Using 10k limit for
// max_open_files led to performance improvement of ~11%.
max_open_files: 10_000,

// We used to have the same cache size for all columns, 32 MiB.
// When some RocksDB inefficiencies were found [`DBCol::State`]
// cache size was increased up to 512 MiB. This was done on 13th of
// Nov 2021 and we consider increasing the value. Tests have shown
// that increase to 25 GiB (we've used this big value to estimate
// performance improvement headroom) having `max_open_files` at 10k
// improved performance of state viewer by 60%.
col_state_cache_size: bytesize::ByteSize::mib(512),

// This value was taken from the Openethereum default parameter and
// we use it since then.
block_size: bytesize::ByteSize::kib(16),

trie_cache_capacities: vec![(ShardUId { version: 1, shard_id: 3 }, 2_000_000)],
}
}
}

pub const fn read_only() -> StoreConfig {
StoreConfig::const_default().with_read_only(true)
impl StoreConfig {
pub fn read_only() -> StoreConfig {
StoreConfig::default().with_read_only(true)
}

pub const fn read_write() -> StoreConfig {
Self::const_default()
pub fn read_write() -> StoreConfig {
Self::default()
}

pub const fn with_read_only(mut self, read_only: bool) -> Self {
pub fn with_read_only(mut self, read_only: bool) -> Self {
self.read_only = read_only;
self
}
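
Since StoreConfig now implements Default and trie_cache_capacities is an ordinary public field, an override for one shard can be built on top of the defaults. A hedged sketch; the path near_store::config::StoreConfig and the 4_000_000 capacity are assumptions (only config.rs itself is visible in this diff):

    use near_primitives::shard_layout::ShardUId;
    use near_store::config::StoreConfig;

    fn store_config_with_bigger_trie_cache() -> StoreConfig {
        StoreConfig {
            // Raise the trie cache only for shard 3 of shard layout version 1;
            // all other shards keep the built-in default capacity.
            trie_cache_capacities: vec![(ShardUId { version: 1, shard_id: 3 }, 4_000_000)],
            ..StoreConfig::default()
        }
    }

Because the struct carries #[serde(default)], the same override can also come from a node's JSON config, roughly as "trie_cache_capacities": [[{"version": 1, "shard_id": 3}, 4000000]] (the JSON shape follows serde's default tuple and struct encoding and is an assumption).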
3 changes: 2 additions & 1 deletion core/store/src/lib.rs
@@ -35,7 +35,8 @@ pub use crate::trie::iterator::TrieIterator;
pub use crate::trie::update::{TrieUpdate, TrieUpdateIterator, TrieUpdateValuePtr};
pub use crate::trie::{
estimator, split_state, ApplyStatePartResult, KeyForStateChanges, PartialStorage, ShardTries,
Trie, TrieCache, TrieCachingStorage, TrieChanges, TrieStorage, WrappedTrieChanges,
Trie, TrieCache, TrieCacheFactory, TrieCachingStorage, TrieChanges, TrieStorage,
WrappedTrieChanges,
};

mod columns;
5 changes: 3 additions & 2 deletions core/store/src/test_utils.rs
@@ -5,7 +5,7 @@ use rand::seq::SliceRandom;
use rand::Rng;

use crate::db::TestDB;
use crate::{ShardTries, Store};
use crate::{ShardTries, Store, TrieCacheFactory};
use near_primitives::account::id::AccountId;
use near_primitives::hash::CryptoHash;
use near_primitives::receipt::{DataReceipt, Receipt, ReceiptEnum};
@@ -26,7 +26,8 @@ pub fn create_tries() -> ShardTries {

pub fn create_tries_complex(shard_version: ShardVersion, num_shards: NumShards) -> ShardTries {
let store = create_test_store();
ShardTries::new(store, shard_version, num_shards)
let trie_cache_factory = TrieCacheFactory::new(Default::default(), shard_version, num_shards);
ShardTries::new(store, trie_cache_factory)
}

pub fn test_populate_trie(
16 changes: 9 additions & 7 deletions core/store/src/trie/mod.rs
@@ -16,7 +16,9 @@ use near_primitives::types::{StateRoot, StateRootNode};
use crate::trie::insert_delete::NodesStorage;
use crate::trie::iterator::TrieIterator;
use crate::trie::nibble_slice::NibbleSlice;
pub use crate::trie::shard_tries::{KeyForStateChanges, ShardTries, WrappedTrieChanges};
pub use crate::trie::shard_tries::{
KeyForStateChanges, ShardTries, TrieCacheFactory, WrappedTrieChanges,
};
pub use crate::trie::trie_storage::{TrieCache, TrieCachingStorage, TrieStorage};
use crate::trie::trie_storage::{TrieMemoryPartialStorage, TrieRecordingStorage};
use crate::StorageError;
@@ -1096,7 +1098,7 @@ mod tests {
#[test]
fn test_trie_restart() {
let store = create_test_store();
let tries = ShardTries::new(store.clone(), 0, 1);
let tries = ShardTries::test(store.clone(), 1);
let empty_root = Trie::empty_root();
let changes = vec![
(b"doge".to_vec(), Some(b"coin".to_vec())),
@@ -1108,7 +1110,7 @@
];
let root = test_populate_trie(&tries, &empty_root, ShardUId::single_shard(), changes);

let tries2 = ShardTries::new(store, 0, 1);
let tries2 = ShardTries::test(store, 1);
let trie2 = tries2.get_trie_for_shard(ShardUId::single_shard());
assert_eq!(trie2.get(&root, b"doge"), Ok(Some(b"coin".to_vec())));
}
@@ -1117,7 +1119,7 @@
#[test]
fn test_trie_recording_reads() {
let store = create_test_store();
let tries = ShardTries::new(store, 0, 1);
let tries = ShardTries::test(store, 1);
let empty_root = Trie::empty_root();
let changes = vec![
(b"doge".to_vec(), Some(b"coin".to_vec())),
@@ -1144,7 +1146,7 @@
#[test]
fn test_trie_recording_reads_update() {
let store = create_test_store();
let tries = ShardTries::new(store, 0, 1);
let tries = ShardTries::test(store, 1);
let empty_root = Trie::empty_root();
let changes = vec![
(b"doge".to_vec(), Some(b"coin".to_vec())),
@@ -1179,7 +1181,7 @@
#[test]
fn test_dump_load_trie() {
let store = create_test_store();
let tries = ShardTries::new(store.clone(), 0, 1);
let tries = ShardTries::test(store.clone(), 1);
let empty_root = Trie::empty_root();
let changes = vec![
(b"doge".to_vec(), Some(b"coin".to_vec())),
@@ -1190,7 +1192,7 @@
store.save_to_file(DBCol::State, &dir.path().join("test.bin")).unwrap();
let store2 = create_test_store();
store2.load_from_file(DBCol::State, &dir.path().join("test.bin")).unwrap();
let tries2 = ShardTries::new(store2, 0, 1);
let tries2 = ShardTries::test(store2, 1);
let trie2 = tries2.get_trie_for_shard(ShardUId::single_shard());
assert_eq!(trie2.get(&root, b"doge").unwrap().unwrap(), b"coin");
}
67 changes: 54 additions & 13 deletions core/store/src/trie/shard_tries.rs
@@ -17,8 +17,44 @@ use crate::trie::{TrieRefcountChange, POISONED_LOCK_ERR};
use crate::{DBCol, DBOp, DBTransaction};
use crate::{Store, StoreUpdate, Trie, TrieChanges, TrieUpdate};

/// Responsible for creation of trie caches, stores necessary configuration for it.
#[derive(Default)]
pub struct TrieCacheFactory {
capacities: HashMap<ShardUId, usize>,
shard_version: ShardVersion,
num_shards: NumShards,
}

impl TrieCacheFactory {
pub fn new(
capacities: HashMap<ShardUId, usize>,
shard_version: ShardVersion,
num_shards: NumShards,
) -> Self {
Self { capacities, shard_version, num_shards }
}

/// Create new cache for the given shard uid.
pub fn create_cache(&self, shard_uid: &ShardUId) -> TrieCache {
match self.capacities.get(shard_uid) {
Some(capacity) => TrieCache::with_capacity(*capacity),
None => TrieCache::new(),
}
}

/// Create caches on the initialization of storage structures.
pub fn create_initial_caches(&self) -> HashMap<ShardUId, TrieCache> {
assert_ne!(self.num_shards, 0);
let shards: Vec<_> = (0..self.num_shards)
.map(|shard_id| ShardUId { version: self.shard_version, shard_id: shard_id as u32 })
.collect();
shards.iter().map(|&shard_uid| (shard_uid, self.create_cache(&shard_uid))).collect()
}
}

struct ShardTriesInner {
store: Store,
trie_cache_factory: TrieCacheFactory,
/// Cache reserved for client actor to use
caches: RwLock<HashMap<ShardUId, TrieCache>>,
/// Cache for readers.
@@ -29,22 +65,21 @@ struct ShardTriesInner {
pub struct ShardTries(Arc<ShardTriesInner>);

impl ShardTries {
fn get_new_cache(shards: &[ShardUId]) -> HashMap<ShardUId, TrieCache> {
shards.iter().map(|&shard_id| (shard_id, TrieCache::new())).collect()
}

pub fn new(store: Store, shard_version: ShardVersion, num_shards: NumShards) -> Self {
assert_ne!(num_shards, 0);
let shards: Vec<_> = (0..num_shards)
.map(|shard_id| ShardUId { version: shard_version, shard_id: shard_id as u32 })
.collect();
pub fn new(store: Store, trie_cache_factory: TrieCacheFactory) -> Self {
let caches = trie_cache_factory.create_initial_caches();
let view_caches = trie_cache_factory.create_initial_caches();
ShardTries(Arc::new(ShardTriesInner {
store,
caches: RwLock::new(Self::get_new_cache(&shards)),
view_caches: RwLock::new(Self::get_new_cache(&shards)),
trie_cache_factory,
caches: RwLock::new(caches),
view_caches: RwLock::new(view_caches),
}))
}

pub fn test(store: Store, num_shards: NumShards) -> Self {
Self::new(store, TrieCacheFactory::new(Default::default(), 0, num_shards))
}

pub fn is_same(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.0, &other.0)
}
@@ -61,7 +96,10 @@ impl ShardTries {
let caches_to_use = if is_view { &self.0.view_caches } else { &self.0.caches };
let cache = {
let mut caches = caches_to_use.write().expect(POISONED_LOCK_ERR);
caches.entry(shard_uid).or_insert_with(TrieCache::new).clone()
caches
.entry(shard_uid)
.or_insert_with(|| self.0.trie_cache_factory.create_cache(&shard_uid))
.clone()
};
let store = Box::new(TrieCachingStorage::new(self.0.store.clone(), cache, shard_uid));
Trie::new(store, shard_uid)
@@ -101,7 +139,10 @@ impl ShardTries {
}
}
for (shard_uid, ops) in shards {
let cache = caches.entry(shard_uid).or_insert_with(TrieCache::new).clone();
let cache = caches
.entry(shard_uid)
.or_insert_with(|| self.0.trie_cache_factory.create_cache(&shard_uid))
.clone();
cache.update_cache(ops);
}
Ok(())
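
To make the factory's behaviour concrete: it pre-creates one cache per shard at startup, and any cache requested later for a shard without an override falls back to the default capacity. A small sketch under the same illustrative numbers as above:

    use std::collections::HashMap;
    use near_primitives::shard_layout::ShardUId;
    use near_store::TrieCacheFactory;

    fn factory_behaviour() {
        let mut capacities = HashMap::new();
        capacities.insert(ShardUId { version: 1, shard_id: 3 }, 2_000_000);
        let factory = TrieCacheFactory::new(capacities, 1, 4);

        // One cache per shard (shard_id 0..4 of version 1) is created up front.
        let caches = factory.create_initial_caches();
        assert_eq!(caches.len(), 4);

        // A shard with no configured capacity gets TrieCache::new(), i.e. the
        // default TRIE_DEFAULT_SHARD_CACHE_SIZE; shard 3 gets the 2_000_000 override.
        let _default_cache = factory.create_cache(&ShardUId { version: 1, shard_id: 0 });
        let _large_cache = factory.create_cache(&ShardUId { version: 1, shard_id: 3 });
    }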
15 changes: 7 additions & 8 deletions core/store/src/trie/trie_storage.rs
@@ -19,7 +19,7 @@ pub struct TrieCache(Arc<Mutex<LruCache<CryptoHash, Arc<[u8]>>>>);

impl TrieCache {
pub fn new() -> Self {
Self::with_capacity(TRIE_MAX_SHARD_CACHE_SIZE)
Self::with_capacity(TRIE_DEFAULT_SHARD_CACHE_SIZE)
}

pub fn with_capacity(cap: usize) -> Self {
@@ -143,20 +143,20 @@ impl TrieStorage for TrieMemoryPartialStorage {
}
}

/// Maximum number of cache entries.
/// Default number of cache entries.
/// It was chosen to fit into RAM well. RAM spend on trie cache should not exceed 50_000 * 4 (number of shards) *
/// TRIE_LIMIT_CACHED_VALUE_SIZE * 2 (number of caches - for regular and view client) = 1.6 GB.
/// TRIE_LIMIT_CACHED_VALUE_SIZE * 2 (number of caches - for regular and view client) = 0.4 GB.
/// In our tests on a single shard, it barely occupied 40 MB, which is dominated by state cache size
/// with 512 MB limit. The total RAM usage for a single shard was 1 GB.
#[cfg(not(feature = "no_cache"))]
const TRIE_MAX_SHARD_CACHE_SIZE: usize = 50000;
const TRIE_DEFAULT_SHARD_CACHE_SIZE: usize = 50000;

#[cfg(feature = "no_cache")]
const TRIE_MAX_SHARD_CACHE_SIZE: usize = 1;
const TRIE_DEFAULT_SHARD_CACHE_SIZE: usize = 1;

/// Values above this size (in bytes) are never cached.
/// Note that Trie inner nodes are always smaller than this.
pub(crate) const TRIE_LIMIT_CACHED_VALUE_SIZE: usize = 4000;
/// Note that most of Trie inner nodes are smaller than this - e.g. branches use around 32 * 16 = 512 bytes.
pub(crate) const TRIE_LIMIT_CACHED_VALUE_SIZE: usize = 1000;

pub struct TrieCachingStorage {
pub(crate) store: Store,
@@ -171,7 +171,6 @@ pub struct TrieCachingStorage {
/// txs/receipts ends. Then cache is removed automatically in `apply_transactions_with_optional_storage_proof` when
/// `TrieCachingStorage` is removed.
/// Note that for both caches key is the hash of value, so for the fixed key the value is unique.
/// TODO (#5920): enable chunk nodes caching in Runtime::apply.
pub(crate) chunk_cache: RefCell<HashMap<CryptoHash, Arc<[u8]>>>,
pub(crate) cache_mode: Cell<TrieCacheMode>,

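
The rename from TRIE_MAX_SHARD_CACHE_SIZE to TRIE_DEFAULT_SHARD_CACHE_SIZE only changes what TrieCache::new() falls back to; explicitly sized caches are unaffected. A tiny sketch (the 2_000_000 value is illustrative):

    use near_store::TrieCache;

    fn caches() {
        // Uses TRIE_DEFAULT_SHARD_CACHE_SIZE: 50_000 entries, or 1 when the
        // no_cache feature is enabled. Values larger than
        // TRIE_LIMIT_CACHED_VALUE_SIZE (now 1000 bytes) are never cached.
        let _default = TrieCache::new();

        // What TrieCacheFactory builds for a shard that has a configured override.
        let _large = TrieCache::with_capacity(2_000_000);
    }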