netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports
In the case of huge hash:* type sets, processing the whole set under the
protection of the set's single spinlock could take too long.

There were four places where the whole hash table of the set was processed
bucket by bucket while holding the spinlock:

- When resizing a set, the original set was locked to exclude kernel side
  add/del element operations (userspace add/del is excluded by the
  nfnetlink mutex). The original set is actually only read during the
  resize, so the spinlocking is replaced with rcu locking of regions.
  As a consequence, kernel side add/del of entries can now run in parallel
  with the resize. In order not to lose those operations, a backlog is
  added and replayed after the successful resize (a backlog sketch follows
  this list).
- Garbage collection of timed out entries was also protected by the spinlock.
  In order not to hold the lock too long, region locking is introduced and a
  single region is processed per gc run (a region-locking sketch follows
  this list). Also, the simple timer based gc is replaced with a workqueue
  based solution. The internal book-keeping (number of elements, size of
  extensions) is moved to region level because of the region locking.
- Adding elements: when the maximum number of elements is reached, the gc
  was called to evict the timed out entries. The new approach is that the gc
  is called just for the matching region, assuming that if the region
  (proportionally) seems to be full, then the whole set is too. We could
  scan the other regions and check every entry under rcu locking, but for
  huge sets that would slow down adding elements.
- Listing the set header data: when the set was defined with timeout
  support, the garbage collector was called to clean up timed out entries
  so that the reported element count and set size were correct. Now the set
  is scanned and only non-timed out entries are counted, without actually
  calling the gc for the whole set.
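
A minimal sketch of the backlog idea from the first point. The names
(backlog_entry, backlog_record, backlog_replay) are hypothetical and only
illustrate the scheme; they are not the identifiers used in the patch:

/* Hypothetical sketch: record kernel side add/del operations that arrive
 * while a resize walks the old table under rcu, then replay them against
 * the new table once it has been published.
 */
#include <linux/types.h>
#include <linux/list.h>
#include <linux/slab.h>

struct backlog_entry {
	struct list_head list;
	void *data;		/* private copy of the element data */
	bool is_add;		/* true: add, false: del */
};

/* Called instead of modifying the table that is being rebuilt. */
static bool backlog_record(struct list_head *backlog, void *data, bool is_add)
{
	struct backlog_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);

	if (!e)
		return false;	/* caller must retry so the op is not lost */
	e->data = data;
	e->is_add = is_add;
	list_add_tail(&e->list, backlog);
	return true;
}

/* Called after the resized table is visible; apply() stands in for the
 * set type's real add/del routine. */
static void backlog_replay(struct list_head *backlog,
			   void (*apply)(void *data, bool is_add))
{
	struct backlog_entry *e, *n;

	list_for_each_entry_safe(e, n, backlog, list) {
		apply(e->data, e->is_add);
		list_del(&e->list);
		kfree(e);
	}
}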
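
The second and third points both rest on splitting the hash table into
regions of buckets so that the gc and the "set full" eviction only ever hold
one region lock at a time. A hedged sketch of that mapping, using the
struct ip_set_region added below in ip_set.h; the region size constant and
the example_* helper names are assumptions, not the values used in the patch:

/* Hypothetical region mapping: one region covers a fixed number of hash
 * buckets; only that region's spinlock is taken while it is cleaned.
 */
#include <linux/netfilter/ipset/ip_set.h>

#define EXAMPLE_REGION_BITS	10	/* assumed 1024 buckets per region */

static u32 example_bucket_to_region(u32 bucket)
{
	return bucket >> EXAMPLE_REGION_BITS;
}

/* One gc step: expire entries of a single region, never the whole table. */
static void example_gc_region(struct ip_set_region *regions, u32 r,
			      void (*expire_bucket)(u32 bucket,
						    struct ip_set_region *reg))
{
	u32 first = r << EXAMPLE_REGION_BITS;
	u32 last = first + (1U << EXAMPLE_REGION_BITS);
	u32 bucket;

	spin_lock_bh(&regions[r].lock);
	for (bucket = first; bucket < last; bucket++)
		/* expire_bucket() also updates reg->elements/reg->ext_size */
		expire_bucket(bucket, &regions[r]);
	spin_unlock_bh(&regions[r].lock);
}

On the add path the same mapping is used: only the region the new element
hashes into is cleaned, on the assumption that a proportionally full region
means a full set.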

Thanks to Florian Westphal for helping me solve the SOFTIRQ-safe ->
SOFTIRQ-unsafe lock order issues while working on the patch.

Reported-by: [email protected]
Reported-by: [email protected]
Reported-by: [email protected]
Fixes: 23c42a4 ("netfilter: ipset: Introduction of new commands and protocol version 7")
Signed-off-by: Jozsef Kadlecsik <[email protected]>
Jozsef Kadlecsik committed Feb 22, 2020
1 parent 83d0585 commit f66ee04
Showing 3 changed files with 472 additions and 206 deletions.
11 changes: 10 additions & 1 deletion include/linux/netfilter/ipset/ip_set.h
@@ -121,6 +121,7 @@ struct ip_set_ext {
 	u32 timeout;
 	u8 packets_op;
 	u8 bytes_op;
+	bool target;
 };
 
 struct ip_set;
@@ -187,6 +188,14 @@ struct ip_set_type_variant {
 	/* Return true if "b" set is the same as "a"
 	 * according to the create set parameters */
 	bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+	/* Region-locking is used */
+	bool region_lock;
+};
+
+struct ip_set_region {
+	spinlock_t lock;	/* Region lock */
+	size_t ext_size;	/* Size of the dynamic extensions */
+	u32 elements;		/* Number of elements vs timeout */
 };
 
 /* The core set type structure */
@@ -501,7 +510,7 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 }
 
 #define IP_SET_INIT_KEXT(skb, opt, set)			\
-	{ .bytes = (skb)->len, .packets = 1,		\
+	{ .bytes = (skb)->len, .packets = 1, .target = true,\
 	  .timeout = ip_set_adt_opt_timeout(opt, set) }
 
 #define IP_SET_INIT_UEXT(set)				\
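
For orientation, a hedged sketch of how a set type could opt into the new
scheme: setting .region_lock makes the core's ip_set_lock()/ip_set_unlock()
wrappers (see ip_set_core.c below) skip the set-wide spinlock, and the type
then keeps one struct ip_set_region per region for its own locking and
book-keeping. The names below are illustrative, not copied from the hash
type implementation:

/* Illustrative only: a variant that does its own per-region locking. */
#include <linux/netfilter/ipset/ip_set.h>

static const struct ip_set_type_variant hash_example_variant = {
	/* .kadt, .uadt, .destroy, .flush, ... omitted in this sketch */
	.region_lock	= true,	/* core skips set->lock for this type */
};

/* Each region's lock and book-keeping would be set up once, e.g.: */
static void example_init_regions(struct ip_set_region *hregion, u32 nregions)
{
	u32 r;

	for (r = 0; r < nregions; r++) {
		spin_lock_init(&hregion[r].lock);
		hregion[r].elements = 0;
		hregion[r].ext_size = 0;
	}
}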
34 changes: 24 additions & 10 deletions net/netfilter/ipset/ip_set_core.c
@@ -723,6 +723,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index)
 	return set;
 }
 
+static inline void
+ip_set_lock(struct ip_set *set)
+{
+	if (!set->variant->region_lock)
+		spin_lock_bh(&set->lock);
+}
+
+static inline void
+ip_set_unlock(struct ip_set *set)
+{
+	if (!set->variant->region_lock)
+		spin_unlock_bh(&set->lock);
+}
+
 int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)
@@ -744,9 +758,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	if (ret == -EAGAIN) {
 		/* Type requests element to be completed */
 		pr_debug("element must be completed, ADD is triggered\n");
-		spin_lock_bh(&set->lock);
+		ip_set_lock(set);
 		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
-		spin_unlock_bh(&set->lock);
+		ip_set_unlock(set);
 		ret = 1;
 	} else {
 		/* --return-nomatch: invert matched element */
@@ -775,9 +789,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 
 	return ret;
 }
@@ -797,9 +811,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 
 	return ret;
 }
@@ -1264,9 +1278,9 @@ ip_set_flush_set(struct ip_set *set)
 {
 	pr_debug("set: %s\n", set->name);
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	set->variant->flush(set);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 }
 
 static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
@@ -1713,9 +1727,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
 	bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
 
 	do {
-		spin_lock_bh(&set->lock);
+		ip_set_lock(set);
 		ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
-		spin_unlock_bh(&set->lock);
+		ip_set_unlock(set);
 		retried = true;
 	} while (ret == -EAGAIN &&
 		 set->variant->resize &&
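
Taken together with the region_lock flag above, the effect of these hunks is
that for region-locked types the core no longer serializes kadt/uadt on
set->lock, so the type has to take the lock of the region it touches itself.
A hedged sketch of what that looks like in a type's add routine; every
example_* identifier and struct htable_example are hypothetical stand-ins
for the hash type internals, which are not shown on this page:

#include <linux/netfilter/ipset/ip_set.h>

/* Hypothetical private data and helpers of a region-locked hash type. */
struct htable_example {
	struct ip_set_region *hregion;	/* one entry per region */
	/* ... hash buckets ... */
};

static u32 example_hash_key(const void *value);
static u32 example_bucket_to_region(u32 bucket);
static int example_bucket_add(struct htable_example *h, u32 bucket,
			      const void *value,
			      const struct ip_set_ext *ext, u32 flags);
static size_t example_ext_size(const struct ip_set *set);

/* Add path of a region-locked type: lock only the matching region. */
static int example_hash_add(struct ip_set *set, const void *value,
			    const struct ip_set_ext *ext, u32 flags)
{
	struct htable_example *h = set->data;
	u32 bucket = example_hash_key(value);
	u32 r = example_bucket_to_region(bucket);
	int ret;

	spin_lock_bh(&h->hregion[r].lock);
	ret = example_bucket_add(h, bucket, value, ext, flags);
	if (!ret) {
		/* per-region book-keeping, as in struct ip_set_region */
		h->hregion[r].elements++;
		h->hregion[r].ext_size += example_ext_size(set);
	}
	spin_unlock_bh(&h->hregion[r].lock);
	return ret;
}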
