Skip to content

Commit

Permalink
System-wide speculative prefetch limit.
Browse files Browse the repository at this point in the history
With some pathological access patterns it is possible to make ZFS
accumulate almost unlimited amount of speculative prefetch ZIOs.
Combined with linear ABD allocations in RAIDZ code, it appears to
be possible to exhaust system KVA, triggering kernel panic.

Address this by introducing a system-wide counter of active prefetch
requests and blocking prefetch distance doubling per stream hits if
the number of active requests is higher that ~6% of ARC size.

Signed-off-by:  Alexander Motin <[email protected]>
Sponsored by:	iXsystems, Inc.
  • Loading branch information
amotin committed Feb 21, 2023
1 parent 0f32b1f commit 370e779
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
1 change: 1 addition & 0 deletions include/sys/arc_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#define _SYS_ARC_IMPL_H

#include <sys/arc.h>
#include <sys/multilist.h>
#include <sys/zio_crypt.h>
#include <sys/zthr.h>
#include <sys/aggsum.h>
Expand Down
28 changes: 23 additions & 5 deletions module/zfs/dmu_zfetch.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
*/

#include <sys/zfs_context.h>
#include <sys/arc_impl.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
Expand Down Expand Up @@ -65,20 +66,23 @@ typedef struct zfetch_stats {
kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_max_streams;
kstat_named_t zfetchstat_io_issued;
kstat_named_t zfetchstat_io_active;
} zfetch_stats_t;

static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "max_streams", KSTAT_DATA_UINT64 },
{ "io_issued", KSTAT_DATA_UINT64 },
{ "io_issued", KSTAT_DATA_UINT64 },
{ "io_active", KSTAT_DATA_UINT64 },
};

struct {
wmsum_t zfetchstat_hits;
wmsum_t zfetchstat_misses;
wmsum_t zfetchstat_max_streams;
wmsum_t zfetchstat_io_issued;
aggsum_t zfetchstat_io_active;
} zfetch_sums;

#define ZFETCHSTAT_BUMP(stat) \
Expand All @@ -104,6 +108,8 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
wmsum_value(&zfetch_sums.zfetchstat_max_streams);
zs->zfetchstat_io_issued.value.ui64 =
wmsum_value(&zfetch_sums.zfetchstat_io_issued);
zs->zfetchstat_io_active.value.ui64 =
aggsum_value(&zfetch_sums.zfetchstat_io_active);
return (0);
}

Expand All @@ -114,6 +120,7 @@ zfetch_init(void)
wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);

zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
Expand All @@ -138,6 +145,7 @@ zfetch_fini(void)
wmsum_fini(&zfetch_sums.zfetchstat_misses);
wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
aggsum_fini(&zfetch_sums.zfetchstat_io_active);
}

/*
Expand Down Expand Up @@ -294,6 +302,7 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
zs->zs_more = B_TRUE;
if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
dmu_zfetch_stream_fini(zs);
aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
}

/*
Expand Down Expand Up @@ -407,20 +416,28 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
* Start prefetch from the demand access size (nblks). Double the
* distance every access up to zfetch_min_distance. After that only
* if needed increase the distance by 1/8 up to zfetch_max_distance.
*
* Don't double the distance beyond single block if we have more
* than ~6% of ARC held by active prefetches. It should help with
* getting out of RAM on some badly mispredicted read patterns.
*/
unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift;
unsigned int dbs = zf->zf_dnode->dn_datablkshift;
unsigned int nbytes = nblks << dbs;
unsigned int pf_nblks;
if (fetch_data) {
if (unlikely(zs->zs_pf_dist < nbytes))
zs->zs_pf_dist = nbytes;
else if (zs->zs_pf_dist < zfetch_min_distance)
else if (zs->zs_pf_dist < zfetch_min_distance &&
(zs->zs_pf_dist < (1 << dbs) ||
aggsum_compare(&zfetch_sums.zfetchstat_io_active,
arc_c_max >> (4 + dbs)) < 0))
zs->zs_pf_dist *= 2;
else if (zs->zs_more)
zs->zs_pf_dist += zs->zs_pf_dist / 8;
zs->zs_more = B_FALSE;
if (zs->zs_pf_dist > zfetch_max_distance)
zs->zs_pf_dist = zfetch_max_distance;
pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift;
pf_nblks = zs->zs_pf_dist >> dbs;
} else {
pf_nblks = 0;
}
Expand All @@ -439,7 +456,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
zs->zs_ipf_dist *= 2;
if (zs->zs_ipf_dist > zfetch_max_idistance)
zs->zs_ipf_dist = zfetch_max_idistance;
pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift;
pf_nblks = zs->zs_ipf_dist >> dbs;
if (zs->zs_ipf_start < zs->zs_pf_end)
zs->zs_ipf_start = zs->zs_pf_end;
if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
Expand Down Expand Up @@ -510,6 +527,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
dmu_zfetch_stream_fini(zs);
return;
}
aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);

if (!have_lock)
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
Expand Down

0 comments on commit 370e779

Please sign in to comment.