Skip to content

Commit

Permalink
use jemalloc instead of system's glibc malloc (openzfs#637)
Browse files Browse the repository at this point in the history
We sometimes see memory usage of the agent process grow very large over
time, with the vast majority of it unaccounted for by our allocation
tracking.  Investigation revealed that most of the memory usage was
unallocated parts of the heap, which were likely stranded due to memory
fragmentation, exacerbated by periodic floods of large allocations (and
then frees a few seconds later) due to zettacache index merging.

This commit switches the agent to use jemalloc, which is less subject to
memory fragmentation.  The process's memory usage (RSS) now more closely
tracks the amount of currently-malloc'd memory.  Additionally, jemalloc
uses less CPU time, and overall performance of zettacache ingestion
improved by up to 25%.

This commit also implements the alloc_zeroed() and realloc() methods of
the GlobalAlloc trait, enabling use of the underlying optimized versions
in jemalloc, which contributes to the above-mentioned overall
performance improvement.
  • Loading branch information
ahrens authored Oct 7, 2022
1 parent 0a6564b commit 3dadc35
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 15 deletions.
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ paxcheck:
CHECKS += flake8
flake8:
$(AM_V_at)if type flake8 > /dev/null 2>&1; then \
flake8 $(top_srcdir); \
flake8 --exclude=build $(top_srcdir); \
else \
echo "skipping flake8 because flake8 is not installed"; \
fi
Expand Down
40 changes: 40 additions & 0 deletions cmd/zfs_object_agent/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions cmd/zfs_object_agent/server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ static GIT_VERSION: &str = git_version!(

tunable! {
static ref ALLOCATOR_PRINT_DURATION: Duration = Duration::from_secs(60);
static ref ALLOCATOR_TOTALS_DURATION: Duration = Duration::from_secs(10);
}

#[derive(Args)]
Expand Down Expand Up @@ -299,6 +300,18 @@ fn main() {
}
});

runtime.spawn(async {
let mut interval = tokio::time::interval(*ALLOCATOR_TOTALS_DURATION);
loop {
interval.tick().await;
let totals = TrackingAllocator::totals();
debug!(
"ALLOCD tracked,allocated,resident: {}, {}, {}",
totals.tracked, totals.allocated, totals.resident
);
}
});

let cache_mode = match cli.cache_device_dir {
Some(dir) => CacheOpenMode::DiscoveryDirectory(dir, cli.guid),
None => CacheOpenMode::None,
Expand Down
2 changes: 2 additions & 0 deletions cmd/zfs_object_agent/util/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,7 @@ serde = { version = "1.0.125", features = ["derive"] }
serde_json = "1.0.64"
serial_test = "0.9.0"
signal-hook = "0.3.13"
tikv-jemalloc-ctl = "0.5.0"
tikv-jemallocator = "0.5.0"
tokio = { version = "1.4", features = ["full"] }
uuid = {version = "1.0.0", features = ["v4", "serde"]}
95 changes: 81 additions & 14 deletions cmd/zfs_object_agent/util/src/alloc.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::alloc::GlobalAlloc;
use std::alloc::Layout;
use std::alloc::System;
use std::cell::Cell;
use std::collections::HashMap;
use std::fmt;
Expand All @@ -13,11 +12,15 @@ use std::sync::Mutex;

use backtrace::Backtrace;
use lazy_static::lazy_static;
use tikv_jemalloc_ctl::epoch;
use tikv_jemalloc_ctl::stats;
use tikv_jemallocator::Jemalloc;

use crate::from64::AsUsize;
use crate::tunable::get_tunable;

lazy_static! {
pub static ref ALLOCATOR_PRINT_MIN_BYTES: u64 =
pub static ref ALLOCATOR_PRINT_MIN_BYTES: usize =
get_tunable("allocator_print_min_bytes", 1024 * 1024);
pub static ref ALLOCATOR_PRINT_MIN_ALLOCS: u64 =
get_tunable("allocator_print_min_allocs", 1_000_000);
Expand Down Expand Up @@ -100,7 +103,7 @@ struct PerBacktraceInfo {
bt: Backtrace,
allocs: u64,
frees: u64,
capacity: u64, // i.e. bytes currently allocated
capacity: usize, // i.e. bytes currently allocated
}

impl Display for PerBacktraceInfo {
Expand All @@ -113,7 +116,7 @@ impl Display for PerBacktraceInfo {
self.capacity / 1024 / 1024,
self.allocs - self.frees,
self.capacity
.checked_div(self.allocs - self.frees)
.checked_div((self.allocs - self.frees).as_usize())
.unwrap_or_default(),
)
}
Expand All @@ -130,7 +133,7 @@ struct PerAllocInfo {
}

impl PerAllocInfo {
fn new(layout: Layout) -> Self {
fn increment(layout: Layout) -> Self {
nonrecursive(|| {
let key = match ALLOC_TAG.with(|a| a.get()) {
Some(str) => Key::Tag(str),
Expand Down Expand Up @@ -169,20 +172,20 @@ impl PerAllocInfo {
};
let e = allocs.get_mut(key_ref).unwrap();
e.allocs += 1;
e.capacity += layout.size() as u64;
e.capacity += layout.size();
PerAllocInfo { key: Some(key_ref) }
})
.unwrap_or_default()
}

fn decrement(&self, layout: Layout) {
fn decrement(self, layout: Layout) {
// Note: allocating memory could result in infinite recursion, and must be avoided.
if let Some(key) = self.key {
let mut h = ALLOCS.lock().unwrap();

let e = h.entry(key).or_default();
e.frees += 1;
e.capacity -= layout.size() as u64;
e.capacity -= layout.size();
}
}
}
Expand All @@ -197,24 +200,62 @@ fn new_layout(layout: Layout) -> (Layout, isize) {
}

unsafe impl GlobalAlloc for TrackingAllocator {
#[inline]
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
let info = PerAllocInfo::new(layout);
let info = PerAllocInfo::increment(layout);
let (new_layout, info_offset) = new_layout(layout);
let ptr = System.alloc(new_layout);
let ptr = Jemalloc.alloc(new_layout);
if !ptr.is_null() {
let info_ptr = ptr.offset(info_offset) as *mut PerAllocInfo;
*info_ptr = info;
}
ptr
}

/// Allocates zero-initialized memory for `layout` through jemalloc while
/// recording the allocation in the tracking tables.
///
/// The request is widened with `new_layout(layout)` so that a `PerAllocInfo`
/// can be stored alongside the caller's data at `info_offset` from the
/// returned pointer. Returns the pointer produced by `Jemalloc.alloc_zeroed`,
/// which is null on allocation failure.
#[inline]
unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
// Account for the caller-visible layout (not the widened one).
// NOTE(review): this runs before the null check below, so a failed
// allocation appears to stay counted - confirm whether that matters.
let info = PerAllocInfo::increment(layout);
// `new_layout` yields the layout actually requested from jemalloc plus the
// offset at which the tracking record lives.
let (new_layout, info_offset) = new_layout(layout);
let ptr = Jemalloc.alloc_zeroed(new_layout);
if !ptr.is_null() {
// Stash the tracking record inside the allocation so that it can be read
// back from the pointer alone.
let info_ptr = ptr.offset(info_offset) as *mut PerAllocInfo;
*info_ptr = info;
}
ptr
}

#[inline]
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
// Note: allocating memory here could result in infinite recursion, and must be avoided.
let (new_layout, info_offset) = new_layout(layout);
let info_ptr = ptr.offset(info_offset) as *const PerAllocInfo;
(*info_ptr).decrement(layout);
info_ptr.read().decrement(layout);

System.dealloc(ptr, new_layout)
Jemalloc.dealloc(ptr, new_layout)
}

/// Resizes an allocation through jemalloc's `realloc`, moving the tracking
/// record from the old layout to the new one.
///
/// * `ptr` - pointer to the existing allocation.
/// * `old_fake_layout` - the layout the caller believes `ptr` has, i.e. without
///   the extra space added by `new_layout`.
/// * `new_fake_size` - the caller-visible size requested; the alignment is
///   taken from `old_fake_layout`.
///
/// Returns the pointer from `Jemalloc.realloc`. When that pointer is null the
/// tracking counters are left untouched and no record is written.
///
/// # Panics
///
/// Panics if `Layout::from_size_align(new_fake_size, old_fake_layout.align())`
/// returns an error.
#[inline]
unsafe fn realloc(
&self,
ptr: *mut u8,
old_fake_layout: Layout,
new_fake_size: usize,
) -> *mut u8 {
// The "real" layout is what was actually requested from jemalloc for the
// existing allocation; the offset locates its stored PerAllocInfo.
let (old_real_layout, old_info_offset) = new_layout(old_fake_layout);
// Copy the old PerAllocInfo out now: after the realloc below the old buffer
// may no longer be valid to read.
let old_info = (ptr.offset(old_info_offset) as *const PerAllocInfo).read();

let new_fake_layout =
Layout::from_size_align(new_fake_size, old_fake_layout.align()).unwrap();
let (new_real_layout, new_info_offset) = new_layout(new_fake_layout);
// jemalloc is given the real (widened) old layout and the real new size.
let ptr = Jemalloc.realloc(ptr, old_real_layout, new_real_layout.size());
if !ptr.is_null() {
// Only on success: retire the old accounting entry, account for the new
// caller-visible layout, and store the fresh record at its new offset.
old_info.decrement(old_fake_layout);
let info = PerAllocInfo::increment(new_fake_layout);
let info_ptr = ptr.offset(new_info_offset) as *mut PerAllocInfo;
*info_ptr = info;
}
ptr
}
}

Expand All @@ -232,17 +273,35 @@ impl TrackingAllocator {
ALLOC_TAG_HIGH_FREQ.store(get_tunable("alloc_tag_high_freq", false), Ordering::Relaxed);
}

pub fn format(min_allocs: u64, min_bytes: u64) -> DelayedFormat {
pub fn format(min_allocs: u64, min_bytes: usize) -> DelayedFormat {
DelayedFormat {
min_allocs,
min_bytes,
}
}

/// Takes a snapshot of memory usage: the bytes accounted for by the
/// tracking tables together with jemalloc's own `allocated` and `resident`
/// statistics.
///
/// # Panics
///
/// Panics if `nonrecursive` yields no value, if the `ALLOCS` lock is
/// poisoned, or if any of the jemalloc control calls fail.
pub fn totals() -> Totals {
    // Sum the per-entry capacities while holding the ALLOCS lock.
    let tracked = nonrecursive(|| {
        ALLOCS
            .lock()
            .unwrap()
            .values()
            .map(|entry| entry.capacity)
            .sum()
    })
    .unwrap();

    // Advance the jemalloc epoch before reading the statistics below.
    epoch::advance().unwrap();
    let allocated = stats::allocated::read().unwrap();
    let resident = stats::resident::read().unwrap();

    Totals {
        tracked,
        allocated,
        resident,
    }
}
}

/// Snapshot of memory-usage totals, as produced by `TrackingAllocator::totals()`.
pub struct Totals {
/// Sum of `capacity` over every entry in the tracking table, i.e. the bytes
/// currently allocated according to the tracking allocator's own bookkeeping.
pub tracked: usize,
/// Value of jemalloc's `stats::allocated` statistic at the time of the snapshot.
pub allocated: usize,
/// Value of jemalloc's `stats::resident` statistic at the time of the snapshot.
pub resident: usize,
}

pub struct DelayedFormat {
min_allocs: u64,
min_bytes: u64,
min_bytes: usize,
}

impl Display for DelayedFormat {
Expand Down Expand Up @@ -288,6 +347,14 @@ impl Display for DelayedFormat {
}
}
writeln!(f, "TOTAL TRACKED: {}", total)?;

epoch::advance().unwrap();
writeln!(
f,
"jemalloc: {}MB allocated out of {}MB resident",
stats::allocated::read().unwrap() / 1024 / 1024,
stats::resident::read().unwrap() / 1024 / 1024,
)?;
Ok(())
}
}
Expand Down

0 comments on commit 3dadc35

Please sign in to comment.