netfilter: snat: evict closing tcp entries on reply tuple collision
When all tried source tuples are in use, the connection request (skb)
and the new conntrack will be dropped in nf_confirm() due to the
non-recoverable clash.

Make it so that the last 32 attempts are allowed to evict a colliding
entry if this connection is already closing and the new sequence number
has advanced past the old one.

Such "all tuples taken" secenario can happen with tcp-rpc workloads where
same dst:dport gets queried repeatedly.

Signed-off-by: Florian Westphal <[email protected]>
Signed-off-by: Pablo Neira Ayuso <[email protected]>
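
The "last 32 attempts" above correspond to NF_NAT_HARDER_THRESH in the diff
below, defined as NF_NAT_MAX_ATTEMPTS / 4 = 128 / 4 = 32. As a reading aid,
here is a minimal standalone sketch of the eviction policy this changelog
describes; all types and field names (struct fake_ct and friends) are
invented for illustration and are not kernel identifiers.

/* Illustrative sketch only -- not part of the commit below. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ATTEMPTS  128
#define HARDER_THRESH (MAX_ATTEMPTS / 4)  /* == 32, the "last 32 attempts" */

/* simplified stand-in for the TCP conntrack state machine */
enum ct_state { CT_ESTABLISHED, CT_FIN_WAIT, CT_TIME_WAIT, CT_CLOSE };

struct fake_ct {
	enum ct_state state;   /* state of the entry */
	bool src_nat;          /* entry was source-NATed */
	bool fixed_timeout;    /* timeout is pinned, never evict */
	bool dying;            /* already being torn down */
	uint32_t td_end;       /* highest sequence number seen so far */
};

/* Only evict a colliding entry when we are down to the last HARDER_THRESH
 * port candidates, the old connection is already closing, nothing pins it,
 * and the new connection's sequence numbers have moved past the old ones.
 */
static bool may_evict(const struct fake_ct *old, const struct fake_ct *new,
		      unsigned int attempts_left)
{
	if (attempts_left > HARDER_THRESH)
		return false;                     /* not desperate yet */
	if (old->state < CT_TIME_WAIT)
		return false;                     /* old entry not closing */
	if (old->fixed_timeout || old->dying || !old->src_nat)
		return false;
	return (int32_t)(new->td_end - old->td_end) > 0;
}

int main(void)
{
	struct fake_ct old = { CT_TIME_WAIT, true, false, false, 1000 };
	struct fake_ct new = { CT_ESTABLISHED, true, false, false, 5000 };

	printf("evict? %d\n", may_evict(&old, &new, 10));   /* 1: allowed */
	printf("evict? %d\n", may_evict(&old, &new, 100));  /* 0: too early */
	return 0;
}
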
Florian Westphal authored and ummakynes committed Jun 26, 2023
1 parent 96b2ef9 commit 4589725
Showing 1 changed file with 88 additions and 4 deletions: net/netfilter/nf_nat_core.c
@@ -27,6 +27,9 @@

#include "nf_internals.h"

#define NF_NAT_MAX_ATTEMPTS 128
#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)

static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -197,6 +200,88 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
IPS_DYING;
static const unsigned long flags_needed = IPS_SRC_NAT;
enum tcp_conntrack old_state;

old_state = READ_ONCE(ct->proto.tcp.state);
if (old_state < TCP_CONNTRACK_TIME_WAIT)
return false;

if (flags & flags_refuse)
return false;

return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
* make sure such packets are invalid.
*/
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
return (__s32)(new->proto.tcp.seen[0].td_end -
old->proto.tcp.seen[0].td_end) > 0;
}

static bool
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack,
unsigned int attempts_left)
{
static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
struct nf_conntrack_tuple_hash *thash;
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple reply;
unsigned long flags;
struct nf_conn *ct;
bool taken = true;
struct net *net;

nf_ct_invert_tuple(&reply, tuple);

if (attempts_left > NF_NAT_HARDER_THRESH ||
tuple->dst.protonum != IPPROTO_TCP ||
ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

/* Last few attempts to find a free tcp port. Destructive
* action: evict the colliding entry if it's in timewait state and the
* tcp sequence number has advanced past the one used by the
* old entry.
*/
net = nf_ct_net(ignored_conntrack);
zone = nf_ct_zone(ignored_conntrack);

thash = nf_conntrack_find_get(net, zone, &reply);
if (!thash)
return false;

ct = nf_ct_tuplehash_to_ctrack(thash);

if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
goto out;

if (WARN_ON_ONCE(ct == ignored_conntrack))
goto out;

flags = READ_ONCE(ct->status);
if (!nf_nat_may_kill(ct, flags))
goto out;

if (!nf_seq_has_advanced(ct, ignored_conntrack))
goto out;

/* Even if we can evict do not reuse if entry is offloaded. */
if (nf_ct_kill(ct))
taken = flags & flags_offload;
out:
nf_ct_put(ct);
return taken;
}

static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
const struct nf_nat_range2 *range)
{
@@ -385,7 +470,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
unsigned int range_size, min, max, i, attempts;
__be16 *keyptr;
u16 off;
static const unsigned int max_attempts = 128;

switch (tuple->dst.protonum) {
case IPPROTO_ICMP:
@@ -471,8 +555,8 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
off = get_random_u16();

attempts = range_size;
if (attempts > max_attempts)
attempts = max_attempts;
if (attempts > NF_NAT_MAX_ATTEMPTS)
attempts = NF_NAT_MAX_ATTEMPTS;

/* We are in softirq; doing a search of the entire range risks
* soft lockup when all tuples are already used.
@@ -483,7 +567,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
another_round:
for (i = 0; i < attempts; i++, off++) {
*keyptr = htons(min + off % range_size);
if (!nf_nat_used_tuple(tuple, ct))
if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
return;
}

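The signed cast in nf_seq_has_advanced() above is the usual serial-number
arithmetic trick: the comparison stays correct even when the 32-bit sequence
space wraps around. A small self-contained demonstration in plain C (no kernel
types; seq_after() is an invented name for this example):

/* Illustrative example only -- not part of the commit above. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* "Has b advanced past a?", evaluated modulo 2^32 like the kernel's
 * (__s32)(new->proto.tcp.seen[0].td_end - old->proto.tcp.seen[0].td_end) > 0
 * check.
 */
static bool seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) > 0;
}

int main(void)
{
	assert(seq_after(1000, 2000));              /* plainly later */
	assert(!seq_after(2000, 1000));             /* plainly earlier */
	/* b has wrapped past UINT32_MAX but is still "after" a */
	assert(seq_after(0xffffff00u, 0x00000010u));
	return 0;
}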
