Skip to content

Commit

Permalink
md/raid5: allow the stripe_cache to grow and shrink.
Browse files Browse the repository at this point in the history
The default setting of 256 stripe_heads is probably
much too small for many configurations.  So it is best to make it
auto-configure.

Shrinking the cache under memory pressure is easy.  The only
interesting part here is that we put a fairly high cost
('seeks') on shrinking the cache as the cost is greater than
just having to read more data, it reduces parallelism.

Growing the cache on demand needs to be done carefully.  If we allow
fast growth, that can upset memory balance as lots of dirty memory can
quickly turn into lots of memory queued in the stripe_cache.
It is important for the raid5 block device to appear congested to
allow write-throttling to work.

So we only add stripes slowly. We set a flag when an allocation
fails because all stripes are in use, allocate at a convenient
time when that flag is set, and don't allow it to be set again
until at least one stripe_head has been released for re-use.

This means that a spurt of requests will only cause one stripe_head
to be allocated, but a steady stream of requests will slowly
increase the cache size - until memory pressure puts it back again.

It could take hours to reach a steady state.

The value written to, and displayed in, stripe_cache_size is
used as a minimum.  The cache can grow above this and shrink back
down to it.  The actual size is not directly visible, though it can
be deduced to some extent by watching stripe_cache_active.

Signed-off-by: NeilBrown <[email protected]>
  • Loading branch information
neilbrown committed Apr 21, 2015
1 parent 5423399 commit edbe83a
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 8 deletions.
68 changes: 61 additions & 7 deletions drivers/md/raid5.c
Original file line number Diff line number Diff line change
Expand Up @@ -672,8 +672,13 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
sh = get_free_stripe(conf, hash);
if (!sh && llist_empty(&conf->released_stripes) &&
!test_bit(R5_DID_ALLOC, &conf->cache_state))
set_bit(R5_ALLOC_MORE,
&conf->cache_state);
}
if (noblock && sh == NULL)
break;
if (!sh) {
Expand Down Expand Up @@ -5761,6 +5766,8 @@ static void raid5d(struct md_thread *thread)
int batch_size, released;

released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
clear_bit(R5_DID_ALLOC, &conf->cache_state);

if (
!list_empty(&conf->bitmap_list)) {
Expand Down Expand Up @@ -5799,6 +5806,13 @@ static void raid5d(struct md_thread *thread)
pr_debug("%d stripes handled\n", handled);

spin_unlock_irq(&conf->device_lock);
if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
grow_one_stripe(conf, __GFP_NOWARN);
/* Set flag even if allocation failed. This helps
* slow down allocation requests when mem is short
*/
set_bit(R5_DID_ALLOC, &conf->cache_state);
}

async_tx_issue_pending_all();
blk_finish_plug(&plug);
Expand All @@ -5814,7 +5828,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
spin_lock(&mddev->lock);
conf = mddev->private;
if (conf)
ret = sprintf(page, "%d\n", conf->max_nr_stripes);
ret = sprintf(page, "%d\n", conf->min_nr_stripes);
spin_unlock(&mddev->lock);
return ret;
}
Expand All @@ -5828,10 +5842,12 @@ raid5_set_cache_size(struct mddev *mddev, int size)
if (size <= 16 || size > 32768)
return -EINVAL;

conf->min_nr_stripes = size;
while (size < conf->max_nr_stripes &&
drop_one_stripe(conf))
;


err = md_allow_write(mddev);
if (err)
return err;
Expand Down Expand Up @@ -5947,7 +5963,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
conf = mddev->private;
if (!conf)
err = -ENODEV;
else if (new > conf->max_nr_stripes)
else if (new > conf->min_nr_stripes)
err = -EINVAL;
else
conf->bypass_threshold = new;
Expand Down Expand Up @@ -6228,6 +6244,8 @@ static void raid5_free_percpu(struct r5conf *conf)

static void free_conf(struct r5conf *conf)
{
if (conf->shrinker.seeks)
unregister_shrinker(&conf->shrinker);
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
Expand Down Expand Up @@ -6295,6 +6313,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
return err;
}

static unsigned long raid5_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
int ret = 0;
while (ret < sc->nr_to_scan) {
if (drop_one_stripe(conf) == 0)
return SHRINK_STOP;
ret++;
}
return ret;
}

static unsigned long raid5_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);

if (conf->max_nr_stripes < conf->min_nr_stripes)
/* unlikely, but not impossible */
return 0;
return conf->max_nr_stripes - conf->min_nr_stripes;
}

static struct r5conf *setup_conf(struct mddev *mddev)
{
struct r5conf *conf;
Expand Down Expand Up @@ -6445,17 +6487,29 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->prev_algo = mddev->layout;
}

memory = NR_STRIPES * (sizeof(struct stripe_head) +
conf->min_nr_stripes = NR_STRIPES;
memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
if (grow_stripes(conf, NR_STRIPES)) {
if (grow_stripes(conf, conf->min_nr_stripes)) {
printk(KERN_ERR
"md/raid:%s: couldn't allocate %dkB for buffers\n",
mdname(mddev), memory);
goto abort;
} else
printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
mdname(mddev), memory);
/*
* Losing a stripe head costs more than the time to refill it,
* it reduces the queue depth and so can hurt throughput.
* So set it rather large, scaled by number of devices.
*/
conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
conf->shrinker.scan_objects = raid5_cache_scan;
conf->shrinker.count_objects = raid5_cache_count;
conf->shrinker.batch = 128;
conf->shrinker.flags = 0;
register_shrinker(&conf->shrinker);

sprintf(pers_name, "raid%d", mddev->new_level);
conf->thread = md_register_thread(raid5d, mddev, pers_name);
Expand Down Expand Up @@ -7097,9 +7151,9 @@ static int check_stripe_cache(struct mddev *mddev)
*/
struct r5conf *conf = mddev->private;
if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
> conf->max_nr_stripes ||
> conf->min_nr_stripes ||
((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
> conf->max_nr_stripes) {
> conf->min_nr_stripes) {
printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
mdname(mddev),
((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
Expand Down
11 changes: 10 additions & 1 deletion drivers/md/raid5.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,7 @@ struct r5conf {
int max_degraded;
int raid_disks;
int max_nr_stripes;
int min_nr_stripes;

/* reshape_progress is the leading edge of a 'reshape'
* It has value MaxSector when no reshape is happening
Expand Down Expand Up @@ -513,7 +514,15 @@ struct r5conf {
#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked,
* waiting for 25% to be free
*/

#define R5_ALLOC_MORE 2 /* It might help to allocate another
* stripe.
*/
#define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate
* more until at least one has been
* released. This avoids flooding
* the cache.
*/
struct shrinker shrinker;
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
Expand Down

0 comments on commit edbe83a

Please sign in to comment.