Skip to content

Commit

Permalink
tcp_output(): maximize LSO frame size
Browse files Browse the repository at this point in the history
By maximizing the LSO packet size, we reduce CPU usage both in
the guest and in the hypervisor.

This improves netperf TCP_STREAM performance, measured against a
netserver running on the same host as the OSv guest (8GB RAM, 1 vCPU),
by the following margins:

message size    improvement over original
64              30.18%
128             31.86%
256             59.60%
512             97.88%
1024            214.14%
2048            247.84%
4096            208.82%
8192            106.49%
16384           38.51%
32768           12.69%
65536           5.01%

Signed-off-by: Vlad Zolotarov <[email protected]>
Signed-off-by: Pekka Enberg <[email protected]>
  • Loading branch information
Vlad Zolotarov authored and Pekka Enberg committed Jul 17, 2014
1 parent 10fda3e commit 902535a
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 4 deletions.
52 changes: 48 additions & 4 deletions bsd/sys/netinet/tcp_output.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@

#include <machine/in_cksum.h>

// Tracepoints for the deferred TSO flush handling in tcp_output().

// Hit when tcp_output() defers a short send and arms the TT_TSO_FLUSH timer.
TRACEPOINT(trace_tso_flush_sched, "");
TRACEPOINT(trace_tso_flush_cancel, "");
// Hit when tcp_output() sends because TF_TSO_NOW is set. The arguments are,
// in order: len, off, sendwin, so->so_snd.sb_cc and tp->snd_nxt.raw() + len.
TRACEPOINT(trace_tso_flush_fire,
"Going to send %d bytes, off %d, sendwin %d sb_cc "
"%d cur_seq %u", int, int, int, int, unsigned int);

VNET_DEFINE(int, path_mtu_discovery) = 1;
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
&VNET_NAME(path_mtu_discovery), 1,
Expand Down Expand Up @@ -128,6 +134,16 @@ cc_after_idle(struct tcpcb *tp)
CC_ALGO(tp)->after_idle(tp->ccv);
}

static inline void cancel_tso_flush_timer(struct tcpcb *tp)
{
//
// Don't actually "cancel" the timer, just make it do nothing when it
// fires.
//
tp->t_flags &= ~((u_int)TF_TSO_NOW);
tp->t_flags &= ~((u_int)TF_TSO_PENDING);
}

/*
* Tcp output routine: figure out what should be sent and send it.
*/
Expand Down Expand Up @@ -450,15 +466,19 @@ tcp_output(struct tcpcb *tp)
*/
ipsec_optlen = ipsec_hdrsiz_tcp(tp);
#endif
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
bool tso_capable = false;
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
((tp->t_flags & TF_SIGNATURE) == 0) &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
#ifdef IPSEC
ipsec_optlen == 0 &&
#endif
tp->t_inpcb->inp_options == NULL &&
tp->t_inpcb->in6p_options == NULL)
tso = 1;
tp->t_inpcb->in6p_options == NULL) {
tso_capable = true;
}

tso = !!(tso_capable && len > tp->t_maxseg);

if (sack_rxmit) {
if (p->rxmit + len < tp->snd_una + so->so_snd.sb_cc)
Expand All @@ -483,8 +503,26 @@ tcp_output(struct tcpcb *tp)
* - we need to retransmit
*/
if (len) {
if (len >= tp->t_maxseg)
u_int send_thresh = tp->t_maxseg;

if (tso_capable) {
send_thresh = bsd_min(IP_MAXPACKET, sendwin);
}

if (len >= send_thresh) {
goto send;
} else if (tp->t_flags & TF_TSO_NOW) {
trace_tso_flush_fire(len, off, sendwin,
so->so_snd.sb_cc,
tp->snd_nxt.raw() + len);
goto send;
} else if (!(tp->t_flags & TF_TSO_PENDING)) {
trace_tso_flush_sched();
tp->t_flags |= TF_TSO_PENDING;
// Defer sending for no longer than 2 ticks
tcp_timer_activate(tp, TT_TSO_FLUSH, 2);
}

/*
* NOTE! on localhost connections an 'ack' from the remote
* end may occur synchronously with the output and cause
Expand Down Expand Up @@ -1196,6 +1234,9 @@ tcp_output(struct tcpcb *tp)
*/
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);

// We are about to send the frame - cancel the TSO_FLUSH timer
cancel_tso_flush_timer(tp);

/* TODO: IPv6 IP6TOS_ECT bit on */
error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
Expand Down Expand Up @@ -1230,6 +1271,9 @@ tcp_output(struct tcpcb *tp)
if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss)
ip->ip_off |= IP_DF;

// We are about to send the frame - cancel the TSO_FLUSH timer
cancel_tso_flush_timer(tp);

error = ip_output(m, tp->t_inpcb->inp_options, &ro,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
tp->t_inpcb);
Expand Down
37 changes: 37 additions & 0 deletions bsd/sys/netinet/tcp_timer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@
#include <bsd/sys/netinet/tcp_debug.h>
#endif

// Tracepoints for tcp_timer_tso_flush(): handler entry, return after calling
// tcp_output(), and early return when there is nothing to flush.
TRACEPOINT(trace_tcp_timer_tso_flush, "");
TRACEPOINT(trace_tcp_timer_tso_flush_ret, "");
TRACEPOINT(trace_tcp_timer_tso_flush_err, "");

int tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
&tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
Expand Down Expand Up @@ -421,6 +425,36 @@ tcp_timer_persist(serial_timer_task& timer, struct tcpcb *tp)
CURVNET_RESTORE();
}

/*
 * TT_TSO_FLUSH timer handler.
 *
 * tcp_output() sets TF_TSO_PENDING and arms this timer when it defers a send
 * that is still below the TSO send threshold. When the timer expires and the
 * flush is still pending, set TF_TSO_NOW and call tcp_output() again so the
 * deferred data is sent regardless of the threshold.
 *
 * timer: the serial timer task that fired; try_fire() tells whether this
 *        expiry should still be acted upon.
 * tp:    the TCP control block the timer belongs to.
 */
static void
tcp_timer_tso_flush(serial_timer_task& timer, struct tcpcb *tp)
{
    trace_tcp_timer_tso_flush();

    CURVNET_SET(tp->t_vnet);
    struct inpcb *inp = tp->t_inpcb;

    KASSERT(inp != NULL, ("tcp_timer_tso_flush: inp == NULL"));
    INP_LOCK(inp);

    // t_flags is only examined with the inpcb lock held. The message is
    // parenthesized, as KASSERT() expects an argument list for it.
    KASSERT(tp->t_flags & TF_TSO, ("tcp_timer_tso_flush: TSO disabled"));

    // Re-check the TF_TSO_PENDING flag under the lock: the flush may have
    // been cancelled by a send that happened since the timer was armed.
    if (!timer.try_fire() || !(tp->t_flags & TF_TSO_PENDING)) {
        INP_UNLOCK(inp);
        CURVNET_RESTORE();
        trace_tcp_timer_tso_flush_err();
        return;
    }

    // Tell tcp_output() to send whatever is queued without waiting for more.
    tp->t_flags |= TF_TSO_NOW;
    (void) tcp_output(tp);

    INP_UNLOCK(inp);
    CURVNET_RESTORE();

    trace_tcp_timer_tso_flush_ret();
}

static void
tcp_timer_rexmt(serial_timer_task& timer, struct tcpcb *tp)
{
Expand Down Expand Up @@ -611,6 +645,9 @@ init_timers(struct tcp_timer* timers, struct tcpcb *tp, struct inpcb *inp)

timers->timers[tcp_timer_type::TT_2MSL] =
new serial_timer_task(inp->inp_lock, std::bind(tcp_timer_2msl, _1, tp));

timers->timers[tcp_timer_type::TT_TSO_FLUSH] =
new serial_timer_task(inp->inp_lock, std::bind(tcp_timer_tso_flush, _1, tp));
}

serial_timer_task&
Expand Down
1 change: 1 addition & 0 deletions bsd/sys/netinet/tcp_timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ enum tcp_timer_type {
TT_PERSIST, /* persist timer */
TT_KEEP, /* keepalive */
TT_2MSL, /* 2*msl TIME_WAIT timer */
TT_TSO_FLUSH, /* TSO flush timer */
COUNT
};

Expand Down
2 changes: 2 additions & 0 deletions bsd/sys/netinet/tcp_var.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ class tcpcb {
#define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */
#define TF_NOPUSH 0x001000 /* don't push */
#define TF_PREVVALID 0x002000 /* saved values for bad rxmit valid */
#define TF_TSO_NOW 0x004000 /* send TSO packet now */
#define TF_TSO_PENDING 0x008000 /* TSO packet is pending */
#define TF_MORETOCOME 0x010000 /* More data to be appended to sock */
#define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */
#define TF_LASTIDLE 0x040000 /* connection was previously idle */
Expand Down

0 comments on commit 902535a

Please sign in to comment.