From 8f87ec301b75e2949d3305a3868fda7c174b3fe5 Mon Sep 17 00:00:00 2001 From: Puneeth Nanjundaswamy Date: Fri, 28 Mar 2014 17:56:51 +0100 Subject: [PATCH] Inital merge from Linux source tree 2.6.31.9 to 2.6.32 for aNCR implementation. --- include/linux/sysctl.h | 3 + include/linux/tcp.h | 27 ++- include/net/inet_connection_sock.h | 8 + include/net/tcp.h | 68 +++++- kernel/sysctl_check.c | 3 + nameonly.log | 26 +++ net/ipv4/Kconfig | 71 ++++++ net/ipv4/Makefile | 6 +- net/ipv4/sysctl_net_ipv4.c | 118 ++++++++++ net/ipv4/tcp.c | 38 ++++ net/ipv4/tcp_ancr.c | 338 +++++++++++++++++++++++++++++ net/ipv4/tcp_cong.c | 8 +- net/ipv4/tcp_input.c | 217 +++++++++++++++--- net/ipv4/tcp_ipv4.c | 16 ++ net/ipv4/tcp_leungma.c | 292 +++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 25 +++ net/ipv4/tcp_ncr.c | 277 +++++++++++++++++++++++ net/ipv4/tcp_noreor.c | 60 +++++ net/ipv4/tcp_output.c | 81 ++++++- net/ipv4/tcp_reorder.c | 324 +++++++++++++++++++++++++++ net/ipv4/tcp_timer.c | 7 +- net/ipv6/tcp_ipv6.c | 1 + 22 files changed, 1958 insertions(+), 56 deletions(-) create mode 100644 nameonly.log create mode 100644 net/ipv4/tcp_ancr.c create mode 100644 net/ipv4/tcp_leungma.c create mode 100644 net/ipv4/tcp_ncr.c create mode 100644 net/ipv4/tcp_noreor.c create mode 100644 net/ipv4/tcp_reorder.c diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 1e4743ee683103..694bba4a13013e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -434,6 +434,9 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, + NET_TCP_REORDER=126, + NET_TCP_AVAIL_REORDER=127, + NET_TCP_ALLOWED_REORDER=128, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 61723a7c21fe57..2f3b7938650cc8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -50,7 +50,7 @@ struct tcphdr { fin:1; #else #error "Adjust your defines" -#endif +#endif __be16 window; __sum16 check; __be16 urg_ptr; @@ -61,14 +61,14 @@ struct tcphdr { * (union is compatible to any of its members) * This means this part of the code is -fstrict-aliasing safe now. */ -union tcp_word_hdr { +union tcp_word_hdr { struct tcphdr hdr; __be32 words[5]; -}; +}; -#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) +#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) -enum { +enum { TCP_FLAG_CWR = __cpu_to_be32(0x00800000), TCP_FLAG_ECE = __cpu_to_be32(0x00400000), TCP_FLAG_URG = __cpu_to_be32(0x00200000), @@ -79,7 +79,7 @@ enum { TCP_FLAG_FIN = __cpu_to_be32(0x00010000), TCP_RESERVED_BITS = __cpu_to_be32(0x0F000000), TCP_DATA_OFFSET = __cpu_to_be32(0xF0000000) -}; +}; /* TCP socket options */ #define TCP_NODELAY 1 /* Turn off Nagle's algorithm. 
*/ @@ -96,6 +96,8 @@ enum { #define TCP_QUICKACK 12 /* Block/reenable quick acks */ #define TCP_CONGESTION 13 /* Congestion control algorithm */ #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ +#define TCP_REORDER 15 /* Reordering Algorithm */ +#define TCP_REORDER_MODE 16 /* meaning of "mode" depends on reorder module */ #define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_SACK 2 @@ -157,6 +159,11 @@ struct tcp_info __u32 tcpi_rcv_space; __u32 tcpi_total_retrans; + __u32 tcpi_total_fast_retrans; + __u32 tcpi_total_rto_retrans; + __u32 tcpi_total_dsacks; + __u32 tcpi_dupthresh; + __u32 tcpi_last_reor_sample; }; /* for TCP_MD5SIG socket option */ @@ -244,6 +251,13 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } +struct reorder_sample { + struct list_head list; + u32 seq; + int factor; + int sample; +}; + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -330,6 +344,7 @@ struct tcp_sock { u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; + u32 current_cwnd; u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 696d6e4ce68a0e..479329ff28729d 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -95,6 +95,7 @@ struct inet_connection_sock { __u32 icsk_rto; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; + const struct tcp_reorder_ops *icsk_ro_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state; @@ -126,7 +127,9 @@ struct inet_connection_sock { int probe_size; } icsk_mtup; u32 icsk_ca_priv[16]; + u32 icsk_ro_priv[5]; #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) +#define ICSK_RO_PRIV_SIZE (5 * sizeof(u32)) }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ @@ -144,6 +147,11 @@ static inline void *inet_csk_ca(const struct sock *sk) return (void *)inet_csk(sk)->icsk_ca_priv; } +static inline void *inet_csk_ro(const struct sock *sk) +{ + return (void *)inet_csk(sk)->icsk_ro_priv; +} + extern struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, const gfp_t priority); diff --git a/include/net/tcp.h b/include/net/tcp.h index 03a49c7033774c..c83e291abda126 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -468,9 +468,11 @@ extern __u32 cookie_v6_init_sequence(struct sock *sk, struct sk_buff *skb, extern void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); extern int tcp_may_send_now(struct sock *sk); -extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); +extern int tcp_retransmit_skb(struct sock *, struct sk_buff *, int fast_rexmit); +//extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); extern void tcp_retransmit_timer(struct sock *sk); -extern void tcp_xmit_retransmit_queue(struct sock *); +extern void tcp_xmit_retransmit_queue(struct sock *, int fast_rexmit); +//extern void tcp_xmit_retransmit_queue(struct sock *); extern void tcp_simple_retransmit(struct sock *); extern int tcp_trim_head(struct sock *, struct sk_buff *, u32); extern int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int); @@ -653,6 +655,68 @@ static inline int tcp_skb_mss(const struct sk_buff *skb) return skb_shinfo(skb)->gso_size; } +/* + * 
Interface for adding new TCP segment reordering handlers + */ +#define TCP_REORDER_NAME_MAX 16 +#define TCP_REORDER_MAX 128 +#define TCP_REORDER_BUF_MAX (TCP_REORDER_NAME_MAX*TCP_REORDER_MAX) + +#define TCP_REORDER_NON_RESTRICTED 0x1 + +struct tcp_reorder_ops { + struct list_head list; + unsigned long flags; + + /* initialize private data (optional) */ + void (*init)(struct sock *sk); + /* cleanup private data (optional) */ + void (*release)(struct sock *sk); + + /* return dupack threshold (required) */ + u32 (*dupthresh)(struct sock *sk); + u32 (*moddupthresh)(struct sock *sk); + /* update the mode of operation (required) */ + void (*update_mode)(struct sock *sk, int val); + /* allow cwnd moderation in disorder state [bool] (required) */ + int allow_moderation; + /* allow head timeout to trigger fast recovery [bool] (required) */ + int allow_head_to; + + /* a new sack'ed segment (optional) */ + void (*new_sack)(struct sock *sk); + /* a non-retransmitted SACK hole was filled (optional) */ + void (*sack_hole_filled)(struct sock *sk, int flag); + /* state machine will start now (optional) */ + void (*sm_starts)(struct sock *sk, int flag, int acked); + /* recovery phase starts (optional) */ + void (*recovery_starts)(struct sock *sk, int flag); + void (*recovery_ends)(struct sock *sk, int flag); + /* reordering event with a certain degree was detected (optional) */ + void (*reorder_detected)(struct sock *sk, int length); + /* reordering event with a certain factor was detected (optional) */ + void (*reorder_detected_factor)(struct sock *sk, int factor); + /* a RTO timeout happened (optional) */ + void (*rto_happened)(struct sock *sk); + + char name[TCP_REORDER_NAME_MAX]; + struct module *owner; +}; + +extern int tcp_register_reorder(struct tcp_reorder_ops *ro); +extern void tcp_unregister_reorder(struct tcp_reorder_ops *ro); +extern void tcp_init_reorder(struct sock *sk); +extern void tcp_cleanup_reorder(struct sock *sk); +extern int tcp_set_default_reorder(const char *name); +extern void tcp_get_default_reorder(char *name); +extern void tcp_get_available_reorder(char *buf, size_t maxlen); +extern void tcp_get_allowed_reorder(char *buf, size_t maxlen); +extern int tcp_set_allowed_reorder(char *val); +extern int tcp_set_reorder(struct sock *sk, const char *name); +extern u32 tcp_native_dupthresh(struct sock *sk); +extern struct tcp_reorder_ops tcp_init_reorder_ops; +extern struct tcp_reorder_ops tcp_native; + /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index b6e7aaea46043a..0246aca843c977 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -390,6 +390,9 @@ static const struct trans_ctl_table trans_net_ipv4_table[] = { { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, + { NET_TCP_REORDER, "tcp_reorder"} + { NET_TCP_AVAIL_REORDER, "tcp_available_reorder"} + { NET_TCP_ALLOWED_REORDER, "tcp_allowed_reorder"} { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, {} }; diff --git a/nameonly.log b/nameonly.log new file mode 100644 index 00000000000000..53d9d87e3c1ea0 --- /dev/null +++ b/nameonly.log @@ -0,0 +1,26 @@ +This file contains the list of files that were originally added/modified to the +linux source tree for aNCR implementation + +config-meshnode +config-vmeshnode-pae +include/linux/sysctl.h +include/linux/tcp.h 
+include/net/inet_connection_sock.h +include/net/tcp.h +kernel/sysctl_check.c +net/ipv4/Kconfig +net/ipv4/Makefile +net/ipv4/sysctl_net_ipv4.c +net/ipv4/tcp.c +net/ipv4/tcp_ancr.c +net/ipv4/tcp_cong.c +net/ipv4/tcp_input.c +net/ipv4/tcp_ipv4.c +net/ipv4/tcp_leungma.c +net/ipv4/tcp_minisocks.c +net/ipv4/tcp_ncr.c +net/ipv4/tcp_noreor.c +net/ipv4/tcp_output.c +net/ipv4/tcp_reorder.c +net/ipv4/tcp_timer.c +net/ipv6/tcp_ipv6.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 70491d9035eb40..774fbe3797df4b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -627,3 +627,74 @@ config TCP_MD5SIG If unsure, say N. +menuconfig TCP_REORDER_ADVANCED + bool "TCP: advanced reordering algorithms" + ---help--- + Support for selection of various TCP reordering algorithms. + + Nearly all users can safely say no here, and a safe default + selection will be made (native Linux algorithm). + + If unsure, say N. + +if TCP_REORDER_ADVANCED + +config TCP_REORDER_NCR + tristate "TCP-NCR" + depends on EXPERIMENTAL + default n + ---help--- + TCP-NCR improves the robustness of TCP to non-congestion events. + The algorithm is described in RFC4653 + + For further details see: + http://tools.ietf.org/html/rfc4653 + +config TCP_REORDER_ANCR + tristate "TCP-ANCR" + depends on EXPERIMENTAL + default n + ---help--- + TCP-ANCR improves the robustness of TCP to reordering. + It is based on TCP-NCR (http://tools.ietf.org/html/rfc4653) + and the native Linux behavior. + +config TCP_REORDER_LEUNGMA + tristate "TCP-LEUNGMA" + depends on EXPERIMENTAL + default n + ---help--- + TCP-LEUNGMA is based on this paper by Leung + and Ma (http://www.eee.hku.hk/~kcleung/papers/journals/ + TCP_reordering:JCN_2005/JCN04-3-107.pdf) + +choice + prompt "Default TCP reordering algorithm" + default DEFAULT_REORDER_NATIVE + help + Select the TCP reordering algorithm that will be used by default + for all connections. 
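Whatever default is selected here, this patch also allows a per-connection override from userspace through the new TCP_REORDER socket option (value 15 in include/linux/tcp.h above) handled in do_tcp_setsockopt(), alongside the net.ipv4.tcp_reorder, tcp_available_reorder and tcp_allowed_reorder sysctls added in sysctl_net_ipv4.c below. A minimal userspace sketch, assuming that option value and a registered algorithm name such as "ancr"; the wrapper name set_reorder_algorithm is illustrative only:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_REORDER
#define TCP_REORDER 15      /* socket option value introduced by this patch */
#endif

/* Ask the kernel to use the named reordering module on this socket,
 * analogous to selecting a congestion control with TCP_CONGESTION. */
static int set_reorder_algorithm(int fd, const char *name)
{
        return setsockopt(fd, IPPROTO_TCP, TCP_REORDER, name, strlen(name));
}
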
+ + config DEFAULT_REORDER_NCR + bool "ncr" if TCP_REORDER_NCR=y + + config DEFAULT_REORDER_ANCR + bool "ancr" if TCP_REORDER_ANCR=y + + config DEFAULT_REORDER_LEUNGMA + bool "leungma" if TCP_REORDER_LEUNGMA=y + + config DEFAULT_REORDER_NATIVE + bool "native" + +endchoice + +endif + +config DEFAULT_TCP_REORDER + string + default "ncr" if DEFAULT_REORDER_NCR + default "ancr" if DEFAULT_REORDER_ANCR + default "leungma" if DEFAULT_REORDER_LEUNGMA + default "native" + diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 80ff87ce43aac6..4f15556e191de4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ - tcp_minisocks.o tcp_cong.o \ + tcp_minisocks.o tcp_cong.o tcp_reorder.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o \ @@ -48,6 +48,10 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o +obj-$(CONFIG_TCP_REORDER_NCR) += tcp_ncr.o +obj-$(CONFIG_TCP_REORDER_ANCR) += tcp_ancr.o +obj-$(CONFIG_TCP_REORDER_LEUNGMA) += tcp_leungma.o +#obj-$(CONFIG_TCP_REORDER_NONE) += tcp_noreor.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2dcf04d9b005cd..fa41eff343c96a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -90,6 +90,102 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, return ret; } +static int proc_tcp_reorder(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[TCP_REORDER_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_REORDER_NAME_MAX, + }; + int ret; + + tcp_get_default_reorder(val); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_default_reorder(val); + return ret; +} + +static int sysctl_tcp_reorder(ctl_table *table, + void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + char val[TCP_REORDER_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_REORDER_NAME_MAX, + }; + int ret; + + tcp_get_default_reorder(val); + ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); + if (ret == 1 && newval && newlen) + ret = tcp_set_default_reorder(val); + return ret; +} + +static int proc_tcp_available_reorder(ctl_table *ctl, + int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + ctl_table tbl = { .maxlen = TCP_REORDER_BUF_MAX, }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + tcp_get_available_reorder(tbl.data, TCP_REORDER_BUF_MAX); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + kfree(tbl.data); + return ret; +} + +static int proc_allowed_reorder(ctl_table *ctl, + int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + ctl_table tbl = { .maxlen = TCP_REORDER_BUF_MAX }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + + tcp_get_allowed_reorder(tbl.data, tbl.maxlen); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_allowed_reorder(tbl.data); + kfree(tbl.data); + return ret; +} + +static int 
strategy_allowed_reorder(ctl_table *table, + void __user *oldval, + size_t __user *oldlenp, + void __user *newval, + size_t newlen) +{ + ctl_table tbl = { .maxlen = TCP_REORDER_BUF_MAX }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + + tcp_get_available_reorder(tbl.data, tbl.maxlen); + ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); + if (ret == 1 && newval && newlen) + ret = tcp_set_allowed_reorder(tbl.data); + kfree(tbl.data); + + return ret; + +} static int proc_tcp_congestion_control(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -606,6 +702,14 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_congestion_control, .strategy = sysctl_tcp_congestion_control, }, + { + .ctl_name = NET_TCP_REORDER, + .procname = "tcp_reorder", + .mode = 0644, + .maxlen = TCP_REORDER_NAME_MAX, + .proc_handler = proc_tcp_reorder, + .strategy = sysctl_tcp_reorder, + }, { .ctl_name = NET_TCP_ABC, .procname = "tcp_abc", @@ -704,6 +808,20 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_allowed_congestion_control, .strategy = strategy_allowed_congestion_control, }, + { + .procname = "tcp_available_reorder", + .maxlen = TCP_REORDER_BUF_MAX, + .mode = 0444, + .proc_handler = proc_tcp_available_reorder, + }, + { + .ctl_name = NET_TCP_ALLOWED_REORDER, + .procname = "tcp_allowed_reorder", + .maxlen = TCP_REORDER_BUF_MAX, + .mode = 0644, + .proc_handler = proc_allowed_reorder, + .strategy = strategy_allowed_reorder, + }, { .ctl_name = NET_TCP_MAX_SSTHRESH, .procname = "tcp_max_ssthresh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1813bc7108811..97245cd5428bea 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2101,6 +2101,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + if (optname == TCP_REORDER) { + char name[TCP_REORDER_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + val = strncpy_from_user(name, optval, + min(TCP_REORDER_NAME_MAX-1, optlen)); + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = tcp_set_reorder(sk, name); + release_sock(sk); + return err; + } if (optlen < sizeof(int)) return -EINVAL; @@ -2111,6 +2128,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, lock_sock(sk); switch (optname) { + case TCP_REORDER_MODE: + if (icsk->icsk_ro_ops->update_mode) + icsk->icsk_ro_ops->update_mode(sk, val); + break; + case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. 
However * at the point when this call is done we typically don't yet @@ -2342,6 +2364,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; + info->tcpi_total_fast_retrans = tp->total_fast_retrans; + info->tcpi_total_rto_retrans = tp->total_rto_retrans; + info->tcpi_total_dsacks = tp->total_dsacks; + + info->tcpi_dupthresh = inet_csk(sk)->icsk_ro_ops->moddupthresh(sk); + info->tcpi_last_reor_sample = tp->last_reor_sample; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2425,6 +2453,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; + case TCP_REORDER: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, TCP_REORDER_NAME_MAX); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, icsk->icsk_ro_ops->name, len)) + return -EFAULT; + return 0; default: return -ENOPROTOOPT; } @@ -2975,6 +3012,7 @@ void __init tcp_init(void) tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); + tcp_register_reorder(&tcp_native); } EXPORT_SYMBOL(tcp_close); diff --git a/net/ipv4/tcp_ancr.c b/net/ipv4/tcp_ancr.c new file mode 100644 index 00000000000000..6e8d170b25017d --- /dev/null +++ b/net/ipv4/tcp_ancr.c @@ -0,0 +1,338 @@ +/* + * TCP-aNCR reordering response + * + * Inspired by ideas from TCP-NCR and the paper + * "Enhancing TCP Performance to Persistent Packet Reordering" + * by Ka-Cheong Leung and Changming Ma + * + * Changes: + * Lennart Schulte: burst protection in elt + * Lennart Schulte: max factor instead of max reordering length + */ + +#include +#include +#include + +#include + +#define MIN_DUPTHRESH 3 +#define FIXED_POINT_SHIFT 8 + +// copied from tcp_input.c +#define FLAG_DATA_SACKED 0x20 // New SACK. + +/* ancr variables */ +struct ancr { + u8 reorder_mode; + u8 elt_flag; + u8 lt_f; + u32 dupthresh; + u32 max_factor; +}; + +static inline void tcp_ancr_init(struct sock *sk) +{ + struct ancr *ro = inet_csk_ro(sk); + + if (ro->reorder_mode == 2) { + ro->lt_f = 4; + } else { + ro->lt_f = 3; + ro->reorder_mode = 1; + } + ro->elt_flag = 0; + ro->dupthresh = MIN_DUPTHRESH; + ro->max_factor = 0; +} + +static void tcp_ancr_reorder_detected(struct sock *sk, int sample) +{ + //struct ancr *ro = inet_csk_ro(sk); + + //if (sample > ro->max_sample) { + // //printk(KERN_INFO "new max_sample = %u", sample); + // ro->max_sample = sample; + //} +} + +/* New reordering event, recalculate avg and mdev (and dupthresh) + */ +static void tcp_ancr_reordering_detected_factor(struct sock *sk, int factor) +{ + struct ancr *ro = inet_csk_ro(sk); + + //1st condition: use biggest sample ever seen + //2nd condition: ncr upper bound (if normalized sample is 1 this is the same + // as ncr) + if (factor > ro->max_factor) { + //printk(KERN_INFO "new factor = %u", factor); + ro->max_factor = min_t(u32, factor, 1 << FIXED_POINT_SHIFT); + } +} + +/* Test if TCP-ancr may be used + */ +static int tcp_ancr_test(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return tcp_is_sack(tp) && !(tp->nonagle & TCP_NAGLE_OFF); +} + +/* Set the dupthresh + */ +static void tcp_ancr_calc_dupthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ancr *ro = inet_csk_ro(sk); + + /* Minimum is always MIN_DUPTHRESH. + * Maximum is the new dupthresh. 
+ * At the beginning of disorder the dupthresh has to be lower than the new + * dupthresh, since it would never retransmit if no new packets would be + * send during elt. + */ + u32 new = (ro->max_factor * tp->prior_packets_out) >> FIXED_POINT_SHIFT; + u32 ncr = (2 * tp->packets_out)/ro->lt_f; + + /*if (ro->max_factor == 0) { + ro->dupthresh = 3; + //printk(KERN_INFO "return 3"); + return; + }*/ + + //u32 new_fac = ((2 << 16)/ro->max_factor); + //printk(KERN_DEBUG "max_f = %u, new_fac = %u", ro->max_factor,new_fac); + //ro->dupthresh = (((2*tp->packets_out) << 8) / (new_fac + (1 << 8))) + 1; + + ro->dupthresh = min_t(u32, new, ncr); + ro->dupthresh = max_t(u32, ro->dupthresh, MIN_DUPTHRESH); +} + +/* Initiate Extended Limited Transmit + */ +static void tcp_ancr_elt_init(struct sock *sk, int how) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ancr *ro = inet_csk_ro(sk); + + /* on entering disorder current_cwnd has to be set to cwnd + * else it would just be increased all the time */ + if (!how) + tp->current_cwnd = tp->snd_cwnd; + + /* set prior_packets_out if entering disorder or recover is reached */ + //printk(KERN_INFO "elt init: how=%u", how); + if (!how || (tp->high_seq <= tp->snd_una)) { + tp->prior_packets_out = tp->current_cwnd; + tp->high_seq = tp->snd_nxt; + } + + ro->elt_flag = 1; + tcp_ancr_calc_dupthresh(sk); +} + +/* Extended Limited Transmit + * + * tcp_cwnd_down() is not meant to be used in the disorder phase. It is + * implemented under assumptions only valid in the recovery phase. + * So, we need our own version for ELT, similar to the "E"-steps in RFC 4653 + */ +static void tcp_ancr_elt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ancr *ro = inet_csk_ro(sk); + u32 sent; + u32 room = tp->current_cwnd > tcp_packets_in_flight(tp) ? + tp->current_cwnd - tcp_packets_in_flight(tp) : + 0; + + if (ro->reorder_mode == 1) { + //pkts sent during elt up to now + sent = tp->packets_out > tp->current_cwnd ? + tp->packets_out - tp->current_cwnd : + 0; + room = room > sent ? + room - sent : + 0; + if (room > 1) //happens with ACK loss/reordering and after a partial ACK + room = (room+1)/2; //prevent ACK loss/reordering to trigger + //too large packet burst which is followed by + //a long sending pause + } + //printk(KERN_INFO "elt: po=%u, ppo=%u, pif=%u, room=%u", tp->packets_out, tp->prior_packets_out, tcp_packets_in_flight(tp), room); + + tp->snd_cwnd = tcp_packets_in_flight(tp) + min_t(u32, room, 3); // burst protection + tp->snd_cwnd_stamp = tcp_time_stamp; + //printk(KERN_INFO "elt: cwnd=%u", tp->snd_cwnd); + tcp_ancr_calc_dupthresh(sk); +} + +/* Terminate Extended Limited Transmit + */ +static void tcp_ancr_elt_end(struct sock *sk, int flag , int cumack) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct ancr *ro = inet_csk_ro(sk); + + //printk(KERN_INFO "elt_end: cwnd=%u, cumack=%u", tp->snd_cwnd, cumack); + + if (cumack) { + /* New cumulative ACK during ELT, it is reordering.*/ + if (tp->sacked_out > 0) { + //printk(KERN_INFO "elt_end: elt init 1"); + tcp_ancr_elt_init(sk, 1); + } + else { + ro->elt_flag = 0; + + /*The following condition will only be true, if we were previously in + congestion avoidance. In that case, set ssthresh to allow slow + starting quickly back to the previous operating point. Otherwise, + don't touch ssthresh to allow slow start to continue to the point + it was previously supposed to. 
*/ + if (tp->snd_ssthresh < tp->current_cwnd) + tp->snd_ssthresh = tp->current_cwnd; + + tp->snd_cwnd = tcp_packets_in_flight(tp) + 1; + tp->snd_cwnd_stamp = tcp_time_stamp; + } + } else { + /* Dupthresh is reached, start recovery, set ssthresh to an + * appropriate value to start with ratehalving */ + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + ro->elt_flag = 0; + } +} + +/* Return the dupthresh. + * If everything is right, return ancr's dupthresh. + * Else, fall back to native + */ +static u32 tcp_ancr_dupthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ancr *ro = inet_csk_ro(sk); + + if (tcp_ancr_test(sk)) { + return ro->dupthresh; + } + + return tp->reordering; +} + +/* We received a SACK for a segment not previously SACK'ed */ +static void tcp_ancr_new_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ancr *ro = inet_csk_ro(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + + // only init ELT, if we're not already in ELT and this is the first SACK'ed segment + if (tcp_ancr_test(sk) && (!ro->elt_flag) && (tp->sacked_out == 0) && (icsk->icsk_ca_state < TCP_CA_CWR)) + tcp_ancr_elt_init(sk, 0); +} + +/* A non-retransmitted SACK hole was filled */ +static void tcp_ancr_sack_hole_filled(struct sock *sk, int flag) +{ + struct ancr *ro = inet_csk_ro(sk); + + if (ro->elt_flag) + tcp_ancr_elt_end(sk, flag, 1); +} + +/* the state machine will start right after this */ +static void tcp_ancr_sm_starts(struct sock *sk, int flag, int acked) +{ + struct ancr *ro = inet_csk_ro(sk); + struct tcp_sock *tp = tcp_sk(sk); + + //if (ro->elt_flag && !(flag & FLAG_DATA_SACKED)) + // printk(KERN_INFO "elt, no sacked flag!"); + + if (ro->elt_flag && (tp->sacked_out)) //(flag & FLAG_DATA_SACKED)) + tcp_ancr_elt(sk); +} + +/* recovery starts */ +static void tcp_ancr_recovery_starts(struct sock *sk, int flag) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct ancr *ro = inet_csk_ro(sk); + + if (ro->elt_flag) { + tcp_ancr_elt_end(sk, flag, 0); + } + else + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); +} + +static void tcp_ancr_recovery_ends(struct sock *sk, int flag) +{ + //struct ancr *ro = inet_csk_ro(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->sacked_out) //(flag & FLAG_DATA_SACKED) + tcp_ancr_elt_init(sk, 0); +} + +static void tcp_ancr_update_mode(struct sock *sk, int val) { + struct ancr *ro = inet_csk_ro(sk); + + if (val == 2) + ro->reorder_mode = val; + else + ro->reorder_mode = 1; +} + +static void tcp_ancr_rto_happened(struct sock *sk) +{ + struct ancr *ro = inet_csk_ro(sk); + ro->elt_flag = 0; + ro->max_factor = 0; +} + +static struct tcp_reorder_ops tcp_ancr = { + .flags = TCP_REORDER_NON_RESTRICTED, + .name = "ancr", + .owner = THIS_MODULE, + .init = tcp_ancr_init, + .dupthresh = tcp_ancr_dupthresh, + .new_sack = tcp_ancr_new_sack, + .sack_hole_filled = tcp_ancr_sack_hole_filled, + .sm_starts = tcp_ancr_sm_starts, + .recovery_starts = tcp_ancr_recovery_starts, + .recovery_ends = tcp_ancr_recovery_ends, + .reorder_detected = tcp_ancr_reorder_detected, + .reorder_detected_factor = tcp_ancr_reordering_detected_factor, + + .update_mode = tcp_ancr_update_mode, + .allow_moderation = 0, + .allow_head_to = 0, + .moddupthresh = tcp_ancr_dupthresh, + .rto_happened = tcp_ancr_rto_happened, +}; + +static int __init tcp_ancr_register(void) +{ + BUILD_BUG_ON(sizeof(struct ancr) > ICSK_RO_PRIV_SIZE); + tcp_register_reorder(&tcp_ancr); + return 0; +} + +static void __exit 
tcp_ancr_unregister(void) +{ + tcp_unregister_reorder(&tcp_ancr); +} + +module_init(tcp_ancr_register); +module_exit(tcp_ancr_unregister); + +MODULE_AUTHOR("Carsten Wolff, Lennart Schulte"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP-ANCR"); +MODULE_VERSION("3.0"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6428b342b16442..7fd04fc76575c3 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -330,8 +330,10 @@ void tcp_slow_start(struct tcp_sock *tp) tp->snd_cwnd_cnt += cnt; while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { tp->snd_cwnd_cnt -= tp->snd_cwnd; - if (tp->snd_cwnd < tp->snd_cwnd_clamp) + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { tp->snd_cwnd++; + tp->current_cwnd++; + } } } EXPORT_SYMBOL_GPL(tcp_slow_start); @@ -340,8 +342,10 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) { if (tp->snd_cwnd_cnt >= w) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { tp->snd_cwnd++; + tp->current_cwnd++; + } tp->snd_cwnd_cnt = 0; } else { tp->snd_cwnd_cnt++; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d86784be7ab3d0..fc71531806aece 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -937,13 +937,19 @@ static void tcp_init_metrics(struct sock *sk) } static void tcp_update_reordering(struct sock *sk, const int metric, - const int ts) + const int ts, int real_reorder) { + /* real_reorder == 1 -> packet was not retransmitted + * == 0 -> packet was retransmitted + */ + struct tcp_sock *tp = tcp_sk(sk); + //printk(KERN_INFO "vorher .. metric: %u, tp->reordering: %u", metric, tp->reordering); if (metric > tp->reordering) { int mib_idx; tp->reordering = min(TCP_MAX_REORDERING, metric); + //printk(KERN_INFO "nachher .. tp->reordering: %u", tp->reordering); /* This exciting event is worth to be remembered. 
8) */ if (ts) @@ -966,6 +972,67 @@ static void tcp_update_reordering(struct sock *sk, const int metric, #endif tcp_disable_fack(tp); } + if (inet_csk(sk)->icsk_ro_ops->reorder_detected && real_reorder) + inet_csk(sk)->icsk_ro_ops->reorder_detected(sk, metric); + + if (inet_csk(sk)->icsk_ro_ops->reorder_detected_factor && real_reorder && tp->prior_packets_out) + inet_csk(sk)->icsk_ro_ops->reorder_detected_factor(sk, ((metric << 8)/tp->prior_packets_out) + 1); + //else if (!tp->prior_packets_out) + // printk(KERN_INFO "ppo 0, metric %u", metric); + + if (real_reorder) + tp->last_reor_sample = metric; +} + +/* Save reordering samples for retransmitted segments + * @sample number of segments the reordered segment was late + * @sk the socket + * @skb segment which was reordered + */ +static void tcp_save_sample(int sample, struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct reorder_sample *rs; + + //printk(KERN_INFO "sample: %u, seqno: %u", sample, TCP_SKB_CB(skb)->seq); + rs = (struct reorder_sample *)kmalloc(sizeof(struct reorder_sample), GFP_KERNEL); + + rs->sample = sample; + //printk(KERN_DEBUG "prior_packets_out = %u", tp->prior_packets_out); + if (tp->prior_packets_out > 0) + rs->factor = ((sample << 8)/tp->prior_packets_out) + 1; //round up + else + rs->factor = 0; + rs->seq = TCP_SKB_CB(skb)->seq; + + list_add_tail(&rs->list, &tp->reorder_samples); +} + +/* If a DSACK arrives look for a stored sample and use it + * @seq the lowest sequence number of the DSACK block + * @sk the socket + */ +static void tcp_deliver_sample(u32 seq, struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct list_head *lh; + struct reorder_sample *entry; + + list_for_each(lh, &tp->reorder_samples) { + entry = list_entry(lh, struct reorder_sample, list); + if (entry->seq == seq) { + //printk(KERN_INFO "DSACK -> sample: %u, seqno: %u\n", entry->sample, seq); + //tcp_update_reordering(sk, entry->sample, 0, 1); + if (inet_csk(sk)->icsk_ro_ops->reorder_detected_factor) { + //printk(KERN_DEBUG "DSACK -> sample: %u, seqno: %u, factor: %u\n", entry->sample, seq, entry->factor); + inet_csk(sk)->icsk_ro_ops->reorder_detected_factor(sk, entry->factor); + inet_csk(sk)->icsk_ro_ops->reorder_detected(sk, entry->sample); + + tp->last_reor_sample = entry->sample; + } + break; + } + } } /* This must be called before lost_out is incremented */ @@ -1225,11 +1292,18 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, } } + /* Count spurious retransmission by couting DSACKs. + * This works as long as the network does not duplicate segments. */ + if (dup_sack) + tp->total_dsacks++; + /* D-SACK for already forgotten data... Do dumb counting. */ if (dup_sack && !after(end_seq_0, prior_snd_una) && - after(end_seq_0, tp->undo_marker)) + after(end_seq_0, tp->undo_marker)) { tp->undo_retrans--; + tcp_deliver_sample(start_seq_0, sk); + } return dup_sack; } @@ -1305,8 +1379,10 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, if (dup_sack && (sacked & TCPCB_RETRANS)) { if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) tp->undo_retrans--; - if (sacked & TCPCB_SACKED_ACKED) - state->reord = min(fack_count, state->reord); + //if (sacked & TCPCB_SACKED_ACKED) + //state->reord = min(fack_count, state->reord); + /* Reordering sample with DSACK does not work this way + * since it produces samples of kind "b" */ } /* Nothing to do; acked frame is about to be dropped (was ACKed). 
*/ @@ -1319,6 +1395,11 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, * we do not clear RETRANS, believing * that retransmission is still in flight. */ + + int sample = tp->fackets_out - fack_count; + //TODO: save sample for packet + tcp_save_sample(sample, sk, skb); + if (sacked & TCPCB_LOST) { sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= pcount; @@ -1345,6 +1426,9 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, } } + if (inet_csk(sk)->icsk_ro_ops->new_sack) + inet_csk(sk)->icsk_ro_ops->new_sack(sk); + sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; @@ -1704,6 +1788,11 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, return skb; } +static u32 tcp_dupthresh(struct sock *sk) +{ + return inet_csk(sk)->icsk_ro_ops->dupthresh(sk); +} + static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) { return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); @@ -1739,6 +1828,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); + //printk(KERN_INFO "sacktag_write_queue: found_dup_sack=%u", found_dup_sack); if (found_dup_sack) state.flag |= FLAG_DSACKING_ACK; @@ -1913,8 +2003,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, if ((state.reord < tp->fackets_out) && ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && - (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) - tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) { + //printk(KERN_INFO "sacktag_write_queue: update reordering, found_dup_sack=%u, fackets_out=%u, state.reord=%u, metric: %u", found_dup_sack, tp->fackets_out, state.reord, tp->fackets_out - state.reord); + tcp_update_reordering(sk, tp->fackets_out - state.reord, 0, 1); + } out: @@ -1951,8 +2043,10 @@ static int tcp_limit_reno_sacked(struct tcp_sock *tp) static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_limit_reno_sacked(tp)) - tcp_update_reordering(sk, tp->packets_out + addend, 0); + if (tcp_limit_reno_sacked(tp)) { + //printk(KERN_INFO "check_reno_reordering: update reordering, metric: %u", tp->packets_out + addend); + tcp_update_reordering(sk, tp->packets_out + addend, 0, 1); + } } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -2272,7 +2366,7 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag) tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 0); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); return 1; @@ -2314,6 +2408,9 @@ static inline int tcp_head_timedout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + if (!inet_csk(sk)->icsk_ro_ops->allow_head_to) + return 0; + return tp->packets_out && tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } @@ -2425,7 +2522,7 @@ static int tcp_time_to_recover(struct sock *sk) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_dupack_heurestics(tp) > tp->reordering) + if (tcp_dupack_heurestics(tp) > tcp_dupthresh(sk)) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast @@ -2438,7 +2535,7 @@ static int tcp_time_to_recover(struct sock *sk) * recovery more? 
*/ packets_out = tp->packets_out; - if (packets_out <= tp->reordering && + if (packets_out <= tcp_dupthresh(sk) /*tp->reordering*/ && tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited @@ -2491,7 +2588,7 @@ static void tcp_timeout_skbs(struct sock *sk) /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ -static void tcp_mark_head_lost(struct sock *sk, int packets) +static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -2503,6 +2600,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) if (tp->lost_skb_hint) { skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; + /* Head already handled? */ + if (mark_head && skb != tcp_write_queue_head(sk)) + return; } else { skb = tcp_write_queue_head(sk); cnt = 0; @@ -2536,6 +2636,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) } tcp_skb_mark_lost(tp, skb); + + if (mark_head) + break; } tcp_verify_left_out(tp); } @@ -2547,17 +2650,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) struct tcp_sock *tp = tcp_sk(sk); if (tcp_is_reno(tp)) { - tcp_mark_head_lost(sk, 1); + tcp_mark_head_lost(sk, 1, 1); } else if (tcp_is_fack(tp)) { - int lost = tp->fackets_out - tp->reordering; + int lost = tp->fackets_out - tcp_dupthresh(sk); if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, lost); + tcp_mark_head_lost(sk, lost, 0); } else { - int sacked_upto = tp->sacked_out - tp->reordering; - if (sacked_upto < fast_rexmit) - sacked_upto = fast_rexmit; - tcp_mark_head_lost(sk, sacked_upto); + int sacked_upto = tp->sacked_out - tcp_dupthresh(sk); + if (sacked_upto >= 0) + tcp_mark_head_lost(sk, sacked_upto, 0); + else if (fast_rexmit) + tcp_mark_head_lost(sk, 1, 1); } tcp_timeout_skbs(sk); @@ -2653,8 +2757,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) if (icsk->icsk_ca_ops->undo_cwnd) tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); - else + else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); + /* Undo can be done any time. The saved packets_out on + * entering disorder has to be adapted here. */ + tp->prior_packets_out = max_t(u32, tp->prior_packets_out, tp->snd_cwnd); + //printk(KERN_DEBUG "undo_cwr1: ppo = %u", tp->prior_packets_out); + } if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; @@ -2662,7 +2771,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); + tp->prior_packets_out = max_t(u32, tp->prior_packets_out, tp->snd_cwnd); + //printk(KERN_DEBUG "undo_cwr2: ppo = %u", tp->prior_packets_out); } + //printk(KERN_INFO "moderate_cwnd in undo_cwr"); tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -2697,6 +2809,7 @@ static int tcp_try_undo_recovery(struct sock *sk) /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ + //printk(KERN_INFO "moderate_cwnd in try_undo_recovery"); tcp_moderate_cwnd(tp); return 1; } @@ -2719,11 +2832,11 @@ static void tcp_try_undo_dsack(struct sock *sk) /* Undo during fast recovery after partial ACK. 
*/ -static int tcp_try_undo_partial(struct sock *sk, int acked) +static int tcp_try_undo_partial(struct sock *sk, int acked, int real_reorder) { struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); + int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tcp_dupthresh(sk)); if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed @@ -2732,7 +2845,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) if (tp->retrans_out == 0) tp->retrans_stamp = 0; - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1, real_reorder); DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); @@ -2752,12 +2865,15 @@ static int tcp_try_undo_loss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + //printk(KERN_DEBUG "try_undo_loss: undo_marker=%u, tsecr=%u, rex_stamp=%u", tp->undo_marker, tp->rx_opt.rcv_tsecr, tp->retrans_stamp); if (tcp_may_undo(tp)) { struct sk_buff *skb; + //printk(KERN_DEBUG "try_undo_loss: undo"); tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + //printk(KERN_DEBUG "try_undo_loss: %u", TCP_SKB_CB(skb)->seq); } tcp_clear_all_retrans_hints(tp); @@ -2792,6 +2908,14 @@ static void tcp_try_keep_open(struct sock *sk) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { + /* Save packets_out on entering disorder state */ + if (state == TCP_CA_Disorder) { + //if (inet_csk(sk)->icsk_ro_ops->new_sack) + // inet_csk(sk)->icsk_ro_ops->new_sack(sk); + //tp->prior_packets_out = tp->packets_out; + //printk(KERN_DEBUG "begin disorder: %u, ppo = %u", tp->snd_una, tp->prior_packets_out); + } + tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; } @@ -2811,7 +2935,10 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - tcp_moderate_cwnd(tp); + if (inet_csk(sk)->icsk_ro_ops->allow_moderation || + inet_csk(sk)->icsk_ca_state != TCP_CA_Disorder) + //printk(KERN_INFO "moderate_cwnd in try_to_open"); + tcp_moderate_cwnd(tp); } else { tcp_cwnd_down(sk, flag); } @@ -2891,7 +3018,7 @@ void tcp_simple_retransmit(struct sock *sk) tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Loss); } - tcp_xmit_retransmit_queue(sk); + tcp_xmit_retransmit_queue(sk, 0); } /* Process an event, which can update packets-in-flight not trivially. @@ -2911,7 +3038,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) struct tcp_sock *tp = tcp_sk(sk); int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - (tcp_fackets_out(tp) > tp->reordering)); + (tcp_fackets_out(tp) > tcp_dupthresh(sk))); int fast_rexmit = 0, mib_idx; if (WARN_ON(!tp->packets_out && tp->sacked_out)) @@ -2919,6 +3046,9 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (WARN_ON(!tp->sacked_out && tp->fackets_out)) tp->fackets_out = 0; + if (icsk->icsk_ro_ops->sm_starts) + icsk->icsk_ro_ops->sm_starts(sk, flag, pkts_acked); + /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. 
*/ if (flag & FLAG_ECE) @@ -2932,8 +3062,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && - tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); + tp->fackets_out > tcp_dupthresh(sk)) { + tcp_mark_head_lost(sk, tp->fackets_out - tcp_dupthresh(sk), 0); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); } @@ -2970,6 +3100,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Open); + //printk(KERN_DEBUG "exit disorder, cwnd = %u", tp->snd_cwnd); } break; @@ -2979,6 +3110,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (tcp_try_undo_recovery(sk)) return; tcp_complete_cwr(sk); + if (unlikely(icsk->icsk_ro_ops->recovery_ends)) + icsk->icsk_ro_ops->recovery_ends(sk, flag); break; } } @@ -2990,7 +3123,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else - do_lost = tcp_try_undo_partial(sk, pkts_acked); + do_lost = tcp_try_undo_partial(sk, pkts_acked, flag & FLAG_RETRANS_DATA_ACKED); break; case TCP_CA_Loss: if (flag & FLAG_DATA_ACKED) @@ -2998,8 +3131,9 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (!tcp_try_undo_loss(sk)) { + //printk(KERN_INFO "moderate_cwnd in fastretrans_alert"); tcp_moderate_cwnd(tp); - tcp_xmit_retransmit_queue(sk); + tcp_xmit_retransmit_queue(sk, 0); return; } if (icsk->icsk_ca_state != TCP_CA_Open) @@ -3018,6 +3152,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (!tcp_time_to_recover(sk)) { tcp_try_to_open(sk, flag); + //printk(KERN_DEBUG "after processing disorder, cwnd = %u", tp->snd_cwnd); return; } @@ -3049,7 +3184,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (icsk->icsk_ca_state < TCP_CA_CWR) { if (!(flag & FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + if (unlikely(icsk->icsk_ro_ops->recovery_starts)) + icsk->icsk_ro_ops->recovery_starts(sk, flag); + else + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); } @@ -3062,7 +3200,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) tcp_update_scoreboard(sk, fast_rexmit); tcp_cwnd_down(sk, flag); - tcp_xmit_retransmit_queue(sk); + tcp_xmit_retransmit_queue(sk, 1); } static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) @@ -3216,6 +3354,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, seq_rtt = -1; if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) flag |= FLAG_NONHEAD_RETRANS_ACKED; + tcp_save_sample(tp->fackets_out - pkts_acked, sk, skb); } else { ca_seq_rtt = now - scb->when; last_ackt = skb->tstamp; @@ -3284,8 +3423,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? 
That's reordering */ - if (reord < prior_fackets) - tcp_update_reordering(sk, tp->fackets_out - reord, 0); + if (reord < prior_fackets) { + //printk(KERN_INFO "clean_rtx_queue: update reordering, metric: %u", tp->fackets_out - reord); + tcp_update_reordering(sk, tp->fackets_out - reord, 0, 1); + if (icsk->icsk_ro_ops->sack_hole_filled) + icsk->icsk_ro_ops->sack_hole_filled(sk, flag); + } delta = tcp_is_fack(tp) ? pkts_acked : prior_sacked - tp->sacked_out; @@ -3433,6 +3576,7 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; TCP_ECN_queue_cwr(tp); + //printk(KERN_INFO "moderate_cwnd in conservative_spur_to_response"); tcp_moderate_cwnd(tp); } @@ -3653,6 +3797,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tp->frto_highmark = 0; if (tcp_ack_is_dubious(sk, flag)) { + //printk(KERN_INFO "dubious ack: %u", ack); /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) @@ -4766,6 +4911,8 @@ void tcp_cwnd_application_limited(struct sock *sk) if (win_used < tp->snd_cwnd) { tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + /* adjust ppo for app limited connections */ + //tp->prior_packets_out = tp->snd_cwnd; } tp->snd_cwnd_used = 0; } @@ -5473,6 +5620,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); tcp_init_congestion_control(sk); + tcp_init_reorder(sk); /* Prevent spurious tcp_cwnd_restart() on first data * packet. @@ -5716,6 +5864,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); tcp_init_congestion_control(sk); + tcp_init_reorder(sk); /* Prevent spurious tcp_cwnd_restart() on * first data packet. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7cda24b53f6104..5bc1b45fc2eb28 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1814,6 +1814,9 @@ static int tcp_v4_init_sock(struct sock *sk) tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_ro_ops = &tcp_init_reorder_ops; + + INIT_LIST_HEAD(&tp->reorder_samples); sk->sk_state = TCP_CLOSE; @@ -1840,9 +1843,22 @@ void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct reorder_sample *tmp; + struct list_head *pos, *q; + //printk(KERN_EMERG "%s() %08x %08x\n", __func__, sk, tp->reorder_samples.next); + /* Clean reorder samples */ + if (tp->reorder_samples.next) { + list_for_each_safe(pos, q, &tp->reorder_samples) { + tmp = list_entry(pos, struct reorder_sample, list); + list_del(pos); + kfree(tmp); + } + } + tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); + tcp_cleanup_reorder(sk); /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); diff --git a/net/ipv4/tcp_leungma.c b/net/ipv4/tcp_leungma.c new file mode 100644 index 00000000000000..45cabef140cbc5 --- /dev/null +++ b/net/ipv4/tcp_leungma.c @@ -0,0 +1,292 @@ +/* + * TCP-leungma reordering response + * + * Inspired by the paper + * "Enhancing TCP Performance to Persistent Packet Reordering" + * by Ka-Cheong Leung and Changming Ma + */ + +#include +#include +#include + +#include + +#define MIN_DUPTHRESH 3 +#define FIXED_POINT_SHIFT 8 + +// copied from tcp_input.c +#define FLAG_DATA_SACKED 0x20 // New SACK. 
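
/*
 * A standalone sketch of the estimator this file implements: each observed
 * reordering distance feeds an EWMA (rodist_avg) and a mean deviation
 * (rodist_mdev) kept in 8-bit fixed point, with weights 19/64 (~0.3) and
 * 45/64 (~0.7); the resulting dupthresh is avg + ~0.3*mdev, clamped to at
 * least MIN_DUPTHRESH and to the hard-coded ceiling of 256 used further
 * down.  The names ro_est and ro_sample are illustrative only and are not
 * part of this module.
 */
#include <stdint.h>

struct ro_est { uint32_t avg, mdev; };          /* 8-bit fixed point */

static uint32_t ro_sample(struct ro_est *e, uint32_t length)
{
        uint32_t s = length << 8, aerr, dupthresh;

        if (e->avg == 0 && e->mdev == 0) {
                e->avg = s;                     /* first sample seeds avg */
        } else {
                aerr = (e->avg > s) ? e->avg - s : s - e->avg;
                e->mdev = ((19 * aerr) >> 6) + ((45 * e->mdev) >> 6);
                e->avg  = ((19 * s)    >> 6) + ((45 * e->avg)  >> 6);
        }
        dupthresh = (e->avg + ((19 * e->mdev) >> 6)) >> 8;
        if (dupthresh < 3)
                dupthresh = 3;
        if (dupthresh > 256)
                dupthresh = 256;
        return dupthresh;
}
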
+ +/*static int mode = 1; +module_param(mode, int, 0644); +MODULE_PARM_DESC(mode, "mode: careful (1) or aggressive (2)");*/ + +/* leungma variables */ +struct leungma { + u8 reorder_mode; + u8 elt_flag; + u32 dupthresh; + u32 prior_packets_out; + u32 rodist_avg; + u32 rodist_mdev; +}; + +static inline void tcp_leungma_init(struct sock *sk) +{ + struct leungma *ro = inet_csk_ro(sk); + + if (ro->reorder_mode != 2) + ro->reorder_mode = 1; + ro->elt_flag = 0; + ro->dupthresh = MIN_DUPTHRESH; + ro->prior_packets_out = 0; + ro->rodist_avg = 0; + ro->rodist_mdev = 0; +} + +/* recalculate dupthresh + */ +static void tcp_leungma_recalc_dupthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct leungma *ro = inet_csk_ro(sk); + + // | about 0.3 * mdev | + u32 dupthresh = (ro->rodist_avg + ((19 * ro->rodist_mdev) >> 6)) >> FIXED_POINT_SHIFT; + + // FIXME: Who says rto/rtt is really >2 ??? + // upper bound. srtt is the RTT-Value left-shifted by 3. So left-shift rto for a correct RTO/RTT ratio + // | about 0.7 * rto / rtt | + u32 upper_bound = tp->snd_cwnd * (((45 * ((icsk->icsk_rto << 3) / tp->srtt)) >> 6) - 2); + + // FIXME: see above + upper_bound = 256; + + // apply bounds + ro->dupthresh = clamp_t(u32, dupthresh, MIN_DUPTHRESH, upper_bound); +} + +/* New reordering event, recalculate avg and mdev + */ +static void tcp_leungma_reordering_detected(struct sock *sk, int length) +{ + struct leungma *ro = inet_csk_ro(sk); + u32 aerr = 0; + u32 slength = length << FIXED_POINT_SHIFT; + + // on the first event, avg needs to be initialized properly + if (unlikely(!ro->rodist_avg && !ro->rodist_mdev)) { + ro->rodist_avg = slength; + } else { + // recalculate avg and mdev. order is important, here! + aerr = abs(ro->rodist_avg - slength); + // | about 0.3 * aerr | | about 0.7 * mdev | + ro->rodist_mdev = ((19 * aerr) >> 6) + ((45 * ro->rodist_mdev) >> 6); + ro->rodist_avg = ((19 * slength) >> 6) + ((45 * ro->rodist_avg) >> 6); + } + + tcp_leungma_recalc_dupthresh(sk); +} + +/* An RTO happened. We probably waited too long, reduce dupthresh by dumping components. + * Only call on the first RTO for the same segment, because there's no way we could have + * avoided a backed-off RTO by fast-retransmitting more quickly. + */ +static void tcp_leungma_rto_happened(struct sock *sk) +{ + struct leungma *ro = inet_csk_ro(sk); + + if (inet_csk(sk)->icsk_retransmits == 0) { + ro->rodist_avg = ro->rodist_avg >> 1; + ro->rodist_mdev = ro->rodist_mdev >> 2; + + tcp_leungma_recalc_dupthresh(sk); + } +} + +/* Test if TCP-leungma may be used + */ +static int tcp_leungma_test(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return tcp_is_sack(tp) && !(tp->nonagle & TCP_NAGLE_OFF); +} + +/* Initiate Extended Limited Transmit + */ +static void tcp_leungma_elt_init(struct sock *sk, int how) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct leungma *ro = inet_csk_ro(sk); + + if (!how) + ro->prior_packets_out = tp->packets_out; + ro->elt_flag = 1; +} + +/* TCP-leungma Extended Limited Transmit + */ +static void tcp_leungma_elt_end(struct sock *sk, int flag , int cumack) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct leungma *ro = inet_csk_ro(sk); + + if (cumack) { + /* New cumulative ACK during ELT, it is reordering. 
*/ + tp->snd_ssthresh = ro->prior_packets_out; + tp->snd_cwnd = min(tcp_packets_in_flight(tp) + 1, ro->prior_packets_out); + tp->snd_cwnd_stamp = tcp_time_stamp; + if (flag & FLAG_DATA_SACKED) + tcp_leungma_elt_init(sk, 1); + else + ro->elt_flag = 0; + } else { + /* Dupthresh is reached, start recovery */ + // Don't force the RFC + //tp->snd_ssthresh = (ro->prior_packets_out/2); + //tp->snd_cwnd = tp->snd_ssthresh; + //tp->snd_cwnd_stamp = tcp_time_stamp; + // Instead, let the usual Linux recovery happen (with ratehalving) + tp->snd_cwnd = min(tcp_packets_in_flight(tp) + 1, ro->prior_packets_out); + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + ro->elt_flag = 0; + } +} + +/* Extended Limited Transmit + * + * tcp_cwnd_down() is not meant to be used in the disorder phase. It is + * implemented under assumptions only valid in the recovery phase. + * So, we need our own version for ELT, similar to the "E"-steps in RFC 4653 + */ +static void tcp_leungma_elt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct leungma *ro = inet_csk_ro(sk); + u32 sent; + u32 room = ro->prior_packets_out > tcp_packets_in_flight(tp) ? + ro->prior_packets_out - tcp_packets_in_flight(tp) : + 0; + + if (ro->reorder_mode == 1) { + sent = tp->packets_out > ro->prior_packets_out ? + tp->packets_out - ro->prior_packets_out : + 0; + room = room > sent ? + room - sent : + 0; + } + + tp->snd_cwnd = tcp_packets_in_flight(tp) + min_t(u32, room, 3); // burst protection + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* Return the dupthresh. + * If everything is right, return leungma's dupthresh. + * Else, fall back to native + */ +static u32 tcp_leungma_dupthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct leungma *ro = inet_csk_ro(sk); + +// if (ro->elt_flag && tcp_leungma_test(sk)) + if (tcp_leungma_test(sk)) + return ro->dupthresh; + + return tp->reordering; +} + +/* We received a SACK for a segment not previously SACK'ed */ +static void tcp_leungma_new_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct leungma *ro = inet_csk_ro(sk); + + // only init ELT, if we're not already in ELT and this is the first SACK'ed segment + if (tcp_leungma_test(sk) && (!ro->elt_flag) && (tp->sacked_out == 0)) + tcp_leungma_elt_init(sk, 0); +} + +/* A non-retransmitted SACK hole was filled */ +static void tcp_leungma_sack_hole_filled(struct sock *sk, int flag) +{ + struct leungma *ro = inet_csk_ro(sk); + + if (ro->elt_flag) + tcp_leungma_elt_end(sk, flag, 1); +} + +/* the state machine will start right after this */ +static void tcp_leungma_sm_starts(struct sock *sk, int flag) +{ + struct leungma *ro = inet_csk_ro(sk); + + if (ro->elt_flag && (flag & FLAG_DATA_SACKED)) + tcp_leungma_elt(sk); +} + +/* recovery starts */ +static void tcp_leungma_recovery_starts(struct sock *sk, int flag) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct leungma *ro = inet_csk_ro(sk); + + if (ro->elt_flag) + tcp_leungma_elt_end(sk, flag, 0); + else + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); +} + +static void tcp_leungma_update_mode(struct sock *sk, int val) { + struct leungma *ro = inet_csk_ro(sk); + + if (val == 2) + ro->reorder_mode = val; + else + ro->reorder_mode = 1; +} + +static struct tcp_reorder_ops tcp_leungma = { + .flags = TCP_REORDER_NON_RESTRICTED, + .name = "leungma", + .owner = THIS_MODULE, + .init = tcp_leungma_init, + .dupthresh = tcp_leungma_dupthresh, + .new_sack = tcp_leungma_new_sack, + 
.sack_hole_filled = tcp_leungma_sack_hole_filled, + .sm_starts = tcp_leungma_sm_starts, + .recovery_starts = tcp_leungma_recovery_starts, + .reorder_detected = tcp_leungma_reordering_detected, + .rto_happened = tcp_leungma_rto_happened, + .update_mode = tcp_leungma_update_mode, + .allow_moderation = 0, + .allow_head_to = 0, + .moddupthresh = tcp_leungma_dupthresh, +}; + +static int __init tcp_leungma_register(void) +{ + BUILD_BUG_ON(sizeof(struct leungma) > ICSK_RO_PRIV_SIZE); + tcp_register_reorder(&tcp_leungma); + return 0; +} + +static void __exit tcp_leungma_unregister(void) +{ + tcp_unregister_reorder(&tcp_leungma); +} + +module_init(tcp_leungma_register); +module_exit(tcp_leungma_unregister); + +MODULE_AUTHOR("Carsten Wolff"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP-LEUNGMA"); +MODULE_VERSION("1.0"); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4c03598ed9248f..bc64c84dab9733 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -425,6 +425,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_highmark = 0; newicsk->icsk_ca_ops = &tcp_init_congestion_ops; + newicsk->icsk_ro_ops = &tcp_init_reorder_ops; + INIT_LIST_HEAD(&newtp->reorder_samples); tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); @@ -657,6 +659,29 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; +#ifdef CONFIG_TCP_MD5SIG + else { + /* Copy over the MD5 key from the original socket */ + struct tcp_md5sig_key *key; + struct tcp_sock *tp = tcp_sk(sk); + key = tp->af_specific->md5_lookup(sk, child); + if (key != NULL) { + /* + * We're using one, so create a matching key on the + * newsk structure. If we fail to get memory then we + * end up not copying the key across. Shucks. + */ + char *newkey = kmemdup(key->key, key->keylen, + GFP_ATOMIC); + if (newkey) { + if (!tcp_alloc_md5sig_pool()) + BUG(); + tp->af_specific->md5_add(child, child, newkey, + key->keylen); + } + } + } +#endif inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); diff --git a/net/ipv4/tcp_ncr.c b/net/ipv4/tcp_ncr.c new file mode 100644 index 00000000000000..0b062b74810433 --- /dev/null +++ b/net/ipv4/tcp_ncr.c @@ -0,0 +1,277 @@ +/* + * TCP-NCR reordering response + * + * Implements RFC4653 + * http://www.ietf.org/rfc/rfc4653.txt + * + * While reading this code, remember: + * Linux IETF + * packets_out FLIGHT_SIZE + * tcp_flight_size() pipe() + */ + +#include +#include +#include + +#include + +// copied from tcp_input.c +#define FLAG_DATA_SACKED 0x20 /* New SACK. 
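Every reordering module keeps its per-connection state in the icsk_ro_priv[] scratch area and reaches it through inet_csk_ro(); the BUILD_BUG_ON() in tcp_leungma_register() above is what guarantees at compile time that the state actually fits. A minimal sketch of that pattern, with a made-up struct for illustration:

#include <net/tcp.h>

/* hypothetical per-connection state for some reordering module */
struct demo_state {
        u8  reorder_mode;
        u32 dupthresh;
};

static void demo_init(struct sock *sk)
{
        /* inet_csk_ro() simply returns &inet_csk(sk)->icsk_ro_priv */
        struct demo_state *ro = inet_csk_ro(sk);

        /* compile-time guarantee that the state fits the reserved scratch words */
        BUILD_BUG_ON(sizeof(struct demo_state) > ICSK_RO_PRIV_SIZE);

        ro->reorder_mode = 1;
        ro->dupthresh = TCP_FASTRETRANS_THRESH;
}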
*/ + +/*static int mode = 1; +module_param(mode, int, 0644); +MODULE_PARM_DESC(mode, "mode: careful (1) or aggressive (2)"); */ + +/* NCR variables */ +struct ncr { + u8 reorder_mode; + u8 elt_flag; + u32 dupthresh; + u8 lt_f; + u32 prior_packets_out; + u32 acked; +}; + +static void tcp_ncr_update_mode(struct sock *sk, int val) +{ + struct ncr *ro = inet_csk_ro(sk); + + if (val == 2) { + ro->lt_f = 4; + ro->reorder_mode = val; + } else { + ro->lt_f = 3; + ro->reorder_mode = 1; + } +} + +static inline void tcp_ncr_init(struct sock *sk) +{ + struct ncr *ro = inet_csk_ro(sk); + + ro->elt_flag = 0; + ro->dupthresh = TCP_FASTRETRANS_THRESH; + + tcp_ncr_update_mode(sk, ro->reorder_mode); + + ro->prior_packets_out = 0; + + ro->acked = 0; +} + +/* TCP-NCR: Test if TCP-NCR may be used + * (Following RFC 4653 recommendations) + */ +static int tcp_ncr_test(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return tcp_is_sack(tp) && !(tp->nonagle & TCP_NAGLE_OFF); +} + +/* TCP-NCR: Initiate Extended Limited Transmit + * (RFC 4653 Initialization) + */ +static void tcp_ncr_elt_init(struct sock *sk, int how) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ncr *ro = inet_csk_ro(sk); + + //printk(KERN_INFO "elt init: how=%u", how); + if (!how) { //execute in I.1 but not in T.4 + ro->prior_packets_out = tp->packets_out; + ro->acked = 0; + } + ro->elt_flag = 1; + ro->dupthresh = max_t(u32, ((2 * tp->packets_out)/ro->lt_f), 3); +} + +/* TCP-NCR Extended Limited Transmit + * (RFC 4653 Termination) + */ +static void tcp_ncr_elt_end(struct sock *sk, int flag , int cumack) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct ncr *ro = inet_csk_ro(sk); + + //printk(KERN_INFO "elt end: cumack=%u", cumack); + if (cumack) { + /* New cumulative ACK during ELT, it is reordering. */ + if (tp->snd_ssthresh < ro->prior_packets_out) //fix: slowstart bug + tp->snd_ssthresh = ro->prior_packets_out; + tp->snd_cwnd = min(tcp_packets_in_flight(tp) + 1, ro->prior_packets_out); + tp->snd_cwnd_stamp = tcp_time_stamp; + if (tp->sacked_out) + //if (flag & FLAG_DATA_SACKED) it seems as if this doesnt work + tcp_ncr_elt_init(sk, 1); //T.4 + else { + //printk(KERN_INFO "elt_flag 0"); + ro->elt_flag = 0; + } + } else { + /* Dupthresh is reached, start recovery */ + // Don't force the RFC + //tp->snd_ssthresh = (ro->prior_packets_out/2); + //tp->snd_cwnd = tp->snd_ssthresh; + //tp->snd_cwnd_stamp = tcp_time_stamp; + // Instead, let the usual Linux recovery happen (with ratehalving) + tp->snd_cwnd = min(tcp_packets_in_flight(tp) + 1, ro->prior_packets_out); + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + + ro->elt_flag = 0; + } +} + +/* TCP-NCR: Extended Limited Transmit + * (RFC 4653 Main Part) + * + * tcp_cwnd_down() is not meant to be used in the disorder phase. It is + * implemented under assumptions only valid in the recovery phase. + * So, we need our own version for ELT, similar to the "E"-steps in RFC 4653 + */ +static void tcp_ncr_elt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ncr *ro = inet_csk_ro(sk); + u32 sent; + u32 room = ro->prior_packets_out > tcp_packets_in_flight(tp) ? 
+ ro->prior_packets_out - tcp_packets_in_flight(tp) : + 0; + + if (ro->reorder_mode == 1) { + /* acked: when receiving an ACK, that acknowledges new data, but also + * carries SACK information (partial ACK in ELT), careful ELT would + * send a burst of data which resembles the number of packets not sent + * compared to aggressive ELT. With this variable all acknowledged + * segments during ELT can be taken into account. + * */ + sent = (tp->packets_out/* + ro->acked*/) > ro->prior_packets_out ? + (tp->packets_out/* + ro->acked*/) - ro->prior_packets_out : + 0; + room = room > sent ? + room - sent : + 0; + } + + tp->snd_cwnd = tcp_packets_in_flight(tp) + room; + tp->snd_cwnd_stamp = tcp_time_stamp; + + ro->dupthresh = max_t(u32, ((2 * tp->packets_out)/ro->lt_f), 3); +} + +/* Return the dupthresh. + * If everything is right, return NCR's dupthresh. + * Else, fall back to native + */ +static u32 tcp_ncr_dupthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ncr *ro = inet_csk_ro(sk); + + if (ro->elt_flag && tcp_ncr_test(sk)) { + return ro->dupthresh; + } + return tp->reordering; +} + +/* Return the dupthresh calculated by this module + * to show differences between calculated dupthresh + * and native dupthresh in flowgrind logs. + */ +static u32 tcp_ncr_moddupthresh(struct sock *sk) +{ + struct ncr *ro = inet_csk_ro(sk); + return ro->dupthresh; +} + +/* We received a SACK for a segment not previously SACK'ed */ +static void tcp_ncr_new_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ncr *ro = inet_csk_ro(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + + // only init ELT, if we're not already in ELT and this is the first SACK'ed segment + if (tcp_ncr_test(sk) && (!ro->elt_flag) && (tp->sacked_out == 0) && (icsk->icsk_ca_state < TCP_CA_CWR)) + tcp_ncr_elt_init(sk, 0); //I.1 +} + +/* A non-retransmitted SACK hole was filled */ +static void tcp_ncr_sack_hole_filled(struct sock *sk, int flag) +{ + struct ncr *ro = inet_csk_ro(sk); + + if (ro->elt_flag) + tcp_ncr_elt_end(sk, flag, 1); +} + +/* the state machine will start right after this */ +static void tcp_ncr_sm_starts(struct sock *sk, int flag, int acked) +{ + struct ncr *ro = inet_csk_ro(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ro->acked += acked; + + if (ro->elt_flag && (tp->sacked_out)) //(flag & FLAG_DATA_SACKED)) this one works only for new sack info? 
+ tcp_ncr_elt(sk); +} + +/* recovery starts */ +static void tcp_ncr_recovery_starts(struct sock *sk, int flag) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct ncr *ro = inet_csk_ro(sk); + + if (ro->elt_flag) + tcp_ncr_elt_end(sk, flag, 0); + else + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); +} + +static void tcp_ncr_rto_happened(struct sock *sk) +{ + struct ncr *ro = inet_csk_ro(sk); + ro->elt_flag = 0; +} + +static struct tcp_reorder_ops tcp_ncr = { + .flags = TCP_REORDER_NON_RESTRICTED, + .name = "ncr", + .owner = THIS_MODULE, + .init = tcp_ncr_init, + .dupthresh = tcp_ncr_dupthresh, + .new_sack = tcp_ncr_new_sack, + .sack_hole_filled = tcp_ncr_sack_hole_filled, + .sm_starts = tcp_ncr_sm_starts, + .recovery_starts = tcp_ncr_recovery_starts, + .update_mode = tcp_ncr_update_mode, + .allow_moderation = 0, + .allow_head_to = 0, + .moddupthresh = tcp_ncr_moddupthresh, + .rto_happened = tcp_ncr_rto_happened, +}; + +static int __init tcp_ncr_register(void) +{ + BUILD_BUG_ON(sizeof(struct ncr) > ICSK_RO_PRIV_SIZE); + tcp_register_reorder(&tcp_ncr); + return 0; +} + +static void __exit tcp_ncr_unregister(void) +{ + tcp_unregister_reorder(&tcp_ncr); +} + +module_init(tcp_ncr_register); +module_exit(tcp_ncr_unregister); + +MODULE_AUTHOR("Daniel Slot, Carsten Wolff, Lennart Schulte"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP-NCR"); +MODULE_VERSION("1.0"); diff --git a/net/ipv4/tcp_noreor.c b/net/ipv4/tcp_noreor.c new file mode 100644 index 00000000000000..904b52d684fd38 --- /dev/null +++ b/net/ipv4/tcp_noreor.c @@ -0,0 +1,60 @@ +/* + * No reordering detection/response + * dupthresh is always 3 + */ + +#include +#include +#include + +/* + * TCP Linux noreor DupAck threshold + */ + +struct noreor { + u8 reorder_mode; +}; + +u32 tcp_noreor_dupthresh(struct sock *sk) +{ + return 3; +} +//EXPORT_SYMBOL_GPL(tcp_noreor_dupthresh); + +static void tcp_noreor_update_mode(struct sock *sk, int val) { + struct noreor *ro_priv = inet_csk_ro(sk); + + ro_priv->reorder_mode = val; +} + +struct tcp_reorder_ops tcp_noreor = { + .flags = TCP_REORDER_NON_RESTRICTED, + .name = "noreor", + .owner = THIS_MODULE, + .dupthresh = tcp_noreor_dupthresh, + .update_mode= tcp_noreor_update_mode, + .allow_moderation = 1, + .allow_head_to = 1, + .moddupthresh = tcp_noreor_dupthresh, +}; + +static int __init tcp_noreor_register(void) +{ + BUILD_BUG_ON(sizeof(struct noreor) > ICSK_RO_PRIV_SIZE); + tcp_register_reorder(&tcp_noreor); + return 0; +} + +static void __exit tcp_noreor_unregister(void) +{ + tcp_unregister_reorder(&tcp_noreor); +} + +module_init(tcp_noreor_register); +module_exit(tcp_noreor_unregister); + +MODULE_AUTHOR("Lennart Schulte"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP-NOREOR"); +MODULE_VERSION("1.0"); + diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fcd278a7080e16..58ed4e40d7e0db 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1882,7 +1882,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. 
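Unlike leungma's measured threshold, the NCR threshold above is purely a function of the amount of outstanding data: dupthresh = max(2 * packets_out / lt_f, 3), with lt_f = 3 in careful mode and lt_f = 4 in aggressive mode, i.e. roughly two thirds respectively one half of the flight size. A standalone illustration of how the two modes diverge as the window grows:

#include <stdio.h>

/* mirrors the threshold update in tcp_ncr_elt_init()/tcp_ncr_elt():
 * dupthresh = max(2 * packets_out / lt_f, 3), lt_f = 3 (careful) or 4 (aggressive) */
static unsigned int ncr_dupthresh(unsigned int packets_out, unsigned int lt_f)
{
        unsigned int dt = (2 * packets_out) / lt_f;

        return dt > 3 ? dt : 3;
}

int main(void)
{
        unsigned int packets_out;

        for (packets_out = 4; packets_out <= 64; packets_out *= 2)
                printf("%2u packets out: careful %2u, aggressive %2u\n",
                       packets_out,
                       ncr_dupthresh(packets_out, 3),
                       ncr_dupthresh(packets_out, 4));
        return 0;
}

The aggressive variant keeps the threshold lower and therefore fast-retransmits sooner; the careful variant tolerates deeper reordering before entering recovery.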
*/ -int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -1963,12 +1963,16 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); tp->total_retrans++; + if (fast_rexmit) + tp->total_fast_retrans++; + else + tp->total_rto_retrans++; #if FASTRETRANS_DEBUG > 0 - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - if (net_ratelimit()) - printk(KERN_DEBUG "retrans_out leaked.\n"); - } + //if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + // if (net_ratelimit()) + // printk(KERN_DEBUG "retrans_out leaked.\n"); + //} #endif if (!tp->retrans_out) tp->lost_retrans_low = tp->snd_nxt; @@ -2019,6 +2023,43 @@ static int tcp_can_forward_retransmit(struct sock *sk) return 1; } +static void print_queue(struct sock *sk, struct sk_buff *old, struct sk_buff *hole) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb, *prev; + + skb = tcp_write_queue_head(sk); + prev = (struct sk_buff *)(&sk->sk_write_queue); + + if (skb == NULL) { + //printk("NULL head, pkts %u\n", tp->packets_out); + return; + } + //printk("head %p tail %p sendhead %p oldhint %p now %p hole %p high %u\n", + // tcp_write_queue_head(sk), tcp_write_queue_tail(sk), + // tcp_send_head(sk), old, tp->retransmit_skb_hint, hole, + // tp->retransmit_high); + + while (skb) { + //printk("skb %p (%u-%u) next %p prev %p sacked %u\n", + // skb, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + // skb->next, skb->prev, TCP_SKB_CB(skb)->sacked); + //if (prev != skb->prev) + //printk("Inconsistent prev\n"); + + if (skb == tcp_write_queue_tail(sk)) { + //if (skb->next != (struct sk_buff *)(&sk->sk_write_queue)) + //printk("Improper next at tail\n"); + return; + } + + prev = skb; + skb = skb->next; + } + //printk("Encountered unexpected NULL\n"); +} + + /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either @@ -2027,16 +2068,22 @@ static int tcp_can_forward_retransmit(struct sock *sk) * based retransmit packet might feed us FACK information again. * If so, we use it to avoid unnecessarily retransmissions. 
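The new fast_rexmit argument splits retransmission accounting between recovery driven by duplicate ACKs/SACK and recovery driven by the retransmission timer. The timer path in tcp_timer.c later in this patch passes 0; the fast-recovery caller in the tcp_input.c part of the series is presumably expected to pass 1. Reduced to its essence, the accounting added above is:

/* distilled from the tcp_retransmit_skb() hunk above: the fast_rexmit flag
 * decides which of the two tcp_sock counters added by this patch is bumped */
static void tcp_count_retrans(struct tcp_sock *tp, int fast_rexmit)
{
        tp->total_retrans++;
        if (fast_rexmit)
                tp->total_fast_retrans++;       /* dupACK/SACK-triggered retransmission */
        else
                tp->total_rto_retrans++;        /* retransmission timer fired */
}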
*/ -void tcp_xmit_retransmit_queue(struct sock *sk) +static int caught_it = 0; + +void tcp_xmit_retransmit_queue(struct sock *sk, int fast_rexmit) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; + struct sk_buff *old = tp->retransmit_skb_hint; u32 last_lost; int mib_idx; int fwd_rexmitting = 0; + if (!tp->packets_out) + return; + if (!tp->lost_out) tp->retransmit_high = tp->snd_una; @@ -2050,6 +2097,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk) last_lost = tp->snd_una; } +checknull: + if (skb == NULL) { + if (!caught_it) + print_queue(sk, old, hole); + caught_it++; + //if (net_ratelimit()) + //printk("Errors caught so far %u\n", caught_it); + return; + } + + tcp_for_write_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; @@ -2090,7 +2148,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } else if (!(sacked & TCPCB_LOST)) { if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; - continue; + //continue; + goto cont; } else { last_lost = TCP_SKB_CB(skb)->end_seq; @@ -2101,9 +2160,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) - continue; + //continue; + goto cont; - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb, fast_rexmit)) return; NET_INC_STATS_BH(sock_net(sk), mib_idx); @@ -2111,6 +2171,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); +cont: + skb = skb->next; + goto checknull; } } diff --git a/net/ipv4/tcp_reorder.c b/net/ipv4/tcp_reorder.c new file mode 100644 index 00000000000000..5f1ce99fa49238 --- /dev/null +++ b/net/ipv4/tcp_reorder.c @@ -0,0 +1,324 @@ +/* + * Plugable segment reordering modules + * Based on plugable congestion control + * + * Copyright (C) 2009 Carsten Wolff + */ + +#include +#include +#include +#include +#include + +int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; + +struct native { + u8 reorder_mode; +}; + +static DEFINE_SPINLOCK(tcp_reorder_list_lock); +static LIST_HEAD(tcp_reorder_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_reorder_ops *tcp_ro_find(const char *name) +{ + struct tcp_reorder_ops *e; + + list_for_each_entry_rcu(e, &tcp_reorder_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +/* + * Attach new reordering algorithm to the list + * of available options. + */ +int tcp_register_reorder(struct tcp_reorder_ops *ro) +{ + int ret = 0; + + /* all algorithms must implement certain ops */ + if (!ro->dupthresh || !ro->update_mode) { + printk(KERN_ERR "TCP %s does not implement required ops\n", + ro->name); + return -EINVAL; + } + + spin_lock(&tcp_reorder_list_lock); + if (tcp_ro_find(ro->name)) { + printk(KERN_NOTICE "TCP %s already registered\n", ro->name); + ret = -EEXIST; + } else { + list_add_tail_rcu(&ro->list, &tcp_reorder_list); + printk(KERN_INFO "TCP %s registered\n", ro->name); + } + spin_unlock(&tcp_reorder_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_reorder); + +/* + * Remove reordering algorithm, called from + * the module's remove function. Module ref counts are used + * to ensure that this can't be done till all sockets using + * that method are closed. 
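tcp_register_reorder() above rejects any module that does not implement dupthresh() and update_mode(); every other hook is optional. A hypothetical skeleton of the smallest registerable module (tcp_noreor.c above is essentially the real-world version of this):

#include <linux/module.h>
#include <net/tcp.h>

static u32 tcp_fixed_dupthresh(struct sock *sk)
{
        return TCP_FASTRETRANS_THRESH;          /* constant threshold of 3 */
}

static void tcp_fixed_update_mode(struct sock *sk, int val)
{
        /* nothing to adjust: this module has no modes */
}

static struct tcp_reorder_ops tcp_fixed = {
        .flags            = TCP_REORDER_NON_RESTRICTED,
        .name             = "fixed",            /* hypothetical module name */
        .owner            = THIS_MODULE,
        .dupthresh        = tcp_fixed_dupthresh,        /* required */
        .update_mode      = tcp_fixed_update_mode,      /* required */
        .moddupthresh     = tcp_fixed_dupthresh,
        .allow_moderation = 1,
        .allow_head_to    = 1,
};

static int __init tcp_fixed_register(void)
{
        return tcp_register_reorder(&tcp_fixed);
}

static void __exit tcp_fixed_unregister(void)
{
        tcp_unregister_reorder(&tcp_fixed);
}

module_init(tcp_fixed_register);
module_exit(tcp_fixed_unregister);

MODULE_LICENSE("GPL");

Module reference counting (try_module_get() when a socket adopts the ops, module_put() in tcp_cleanup_reorder()) keeps such a module pinned until the last socket using it is closed, which is why unregister can simply drop it from the list.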
+ */ +void tcp_unregister_reorder(struct tcp_reorder_ops *ro) +{ + spin_lock(&tcp_reorder_list_lock); + list_del_rcu(&ro->list); + spin_unlock(&tcp_reorder_list_lock); +} +EXPORT_SYMBOL_GPL(tcp_unregister_reorder); + +/* Assign choice of reordering algorithm. */ +void tcp_init_reorder(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_reorder_ops *ro; + + /* if no choice made yet assign the current value set as default */ + if (icsk->icsk_ro_ops == &tcp_init_reorder_ops) { + rcu_read_lock(); + list_for_each_entry_rcu(ro, &tcp_reorder_list, list) { + if (try_module_get(ro->owner)) { + icsk->icsk_ro_ops = ro; + break; + } + + /* fallback to next available */ + } + rcu_read_unlock(); + } + + if (icsk->icsk_ro_ops->init) + icsk->icsk_ro_ops->init(sk); +} + +/* Manage refcounts on socket close. */ +void tcp_cleanup_reorder(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ro_ops->release) + icsk->icsk_ro_ops->release(sk); + module_put(icsk->icsk_ro_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_reorder(const char *name) +{ + struct tcp_reorder_ops *ro; + int ret = -ENOENT; + + spin_lock(&tcp_reorder_list_lock); + ro = tcp_ro_find(name); +#ifdef CONFIG_MODULES + if (!ro && capable(CAP_SYS_MODULE)) { + spin_unlock(&tcp_reorder_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_reorder_list_lock); + ro = tcp_ro_find(name); + } +#endif + + if (ro) { + ro->flags |= TCP_REORDER_NON_RESTRICTED; /* default is always allowed */ + list_move(&ro->list, &tcp_reorder_list); + ret = 0; + } + spin_unlock(&tcp_reorder_list_lock); + + return ret; +} + +/* Set default value from kernel configuration at bootup */ +static int __init tcp_reorder_default(void) +{ + return tcp_set_default_reorder(CONFIG_DEFAULT_TCP_REORDER); +} +late_initcall(tcp_reorder_default); + + +/* Build string with list of available reordering algorithms */ +void tcp_get_available_reorder(char *buf, size_t maxlen) +{ + struct tcp_reorder_ops *ro; + size_t offs = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(ro, &tcp_reorder_list, list) { + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? "" : " ", ro->name); + + } + rcu_read_unlock(); +} + +/* Get current default reordering algorithm */ +void tcp_get_default_reorder(char *name) +{ + struct tcp_reorder_ops *ro; + /* We will always have linux native... */ + BUG_ON(list_empty(&tcp_reorder_list)); + + rcu_read_lock(); + ro = list_entry(tcp_reorder_list.next, struct tcp_reorder_ops, list); + strncpy(name, ro->name, TCP_REORDER_NAME_MAX); + rcu_read_unlock(); +} + +/* Built list of non-restricted reordering values */ +void tcp_get_allowed_reorder(char *buf, size_t maxlen) +{ + struct tcp_reorder_ops *ro; + size_t offs = 0; + + *buf = '\0'; + rcu_read_lock(); + list_for_each_entry_rcu(ro, &tcp_reorder_list, list) { + if (!(ro->flags & TCP_REORDER_NON_RESTRICTED)) + continue; + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? 
"" : " ", ro->name); + + } + rcu_read_unlock(); +} + +/* Change list of non-restricted reordering algorithms */ +int tcp_set_allowed_reorder(char *val) +{ + struct tcp_reorder_ops *ro; + char *clone, *name; + int ret = 0; + + clone = kstrdup(val, GFP_USER); + if (!clone) + return -ENOMEM; + + spin_lock(&tcp_reorder_list_lock); + /* pass 1 check for bad entries */ + while ((name = strsep(&clone, " ")) && *name) { + ro = tcp_ro_find(name); + if (!ro) { + ret = -ENOENT; + goto out; + } + } + + /* pass 2 clear old values */ + list_for_each_entry_rcu(ro, &tcp_reorder_list, list) + ro->flags &= ~TCP_REORDER_NON_RESTRICTED; + + /* pass 3 mark as allowed */ + while ((name = strsep(&val, " ")) && *name) { + ro = tcp_ro_find(name); + WARN_ON(!ro); + if (ro) + ro->flags |= TCP_REORDER_NON_RESTRICTED; + } +out: + spin_unlock(&tcp_reorder_list_lock); + + return ret; +} + + +/* Change reordering algorithm for socket */ +int tcp_set_reorder(struct sock *sk, const char *name) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_reorder_ops *ro; + struct native *ro_priv = inet_csk_ro(sk); + int err = 0; + + rcu_read_lock(); + ro = tcp_ro_find(name); + + /* no change asking for existing value */ + if (ro == icsk->icsk_ro_ops) + goto out; + +#ifdef CONFIG_MODULES + /* not found attempt to autoload module */ + if (!ro && capable(CAP_SYS_MODULE)) { + rcu_read_unlock(); + request_module("tcp_%s", name); + rcu_read_lock(); + ro = tcp_ro_find(name); + } +#endif + if (!ro) + err = -ENOENT; + + else if (!((ro->flags & TCP_REORDER_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) + err = -EPERM; + + else if (!try_module_get(ro->owner)) + err = -EBUSY; + + else { + tcp_cleanup_reorder(sk); + icsk->icsk_ro_ops = ro; + + if (sk->sk_state != TCP_CLOSE) { + if (icsk->icsk_ro_ops->init) + icsk->icsk_ro_ops->init(sk); + if (icsk->icsk_ro_ops->update_mode) + icsk->icsk_ro_ops->update_mode(sk, ro_priv->reorder_mode); + } + } + out: + rcu_read_unlock(); + return err; +} + +/* + * TCP Linux native DupAck threshold + */ +u32 tcp_native_dupthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + return tp->reordering; +} +EXPORT_SYMBOL_GPL(tcp_native_dupthresh); + +static void tcp_native_update_mode(struct sock *sk, int val) { + struct native *ro_priv = inet_csk_ro(sk); + + ro_priv->reorder_mode = val; +} + +struct tcp_reorder_ops tcp_native = { + .flags = TCP_REORDER_NON_RESTRICTED, + .name = "native", + .owner = THIS_MODULE, + .dupthresh = tcp_native_dupthresh, + .update_mode= tcp_native_update_mode, + .allow_moderation = 1, + .allow_head_to = 1, + .moddupthresh = tcp_native_dupthresh, +}; + +/* Initial reordering algorithm used (until SYN) + * really native under another name so we can tell difference + * during tcp_set_default_reorder + */ +struct tcp_reorder_ops tcp_init_reorder_ops = { + .name = "", + .owner = THIS_MODULE, + .dupthresh = tcp_native_dupthresh, + .update_mode= tcp_native_update_mode, + .allow_moderation = 1, + .moddupthresh = tcp_native_dupthresh, +}; +EXPORT_SYMBOL_GPL(tcp_init_reorder_ops); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index cdb2ca7684d4ab..df2cde7b4cef71 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -320,7 +320,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk, 0); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 0); __sk_dst_reset(sk); goto out_reset_timer; } @@ -349,13 +349,16 @@ void tcp_retransmit_timer(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), 
mib_idx); } + if (icsk->icsk_ro_ops->rto_happened) + icsk->icsk_ro_ops->rto_happened(sk); + if (tcp_use_frto(sk)) { tcp_enter_frto(sk); } else { tcp_enter_loss(sk, 0); } - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { + if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 0) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 21d100b68b190d..0230cfe2065789 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1856,6 +1856,7 @@ static int tcp_v6_init_sock(struct sock *sk) icsk->icsk_af_ops = &ipv6_specific; icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_ro_ops = &tcp_init_reorder_ops; icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
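Finally, a hedged userspace sketch of how an application would pick a reordering module per connection once this series is applied. It assumes that the TCP_REORDER socket option introduced by this patch takes the module name as a string and that TCP_REORDER_MODE takes the careful (1) / aggressive (2) value as an int, mirroring TCP_CONGESTION; the authoritative semantics are in the net/ipv4/tcp.c hunk of this series, and on an unpatched kernel both calls simply fail.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_REORDER                     /* option values introduced by this patch */
#define TCP_REORDER      15
#define TCP_REORDER_MODE 16
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        const char *algo = "ncr";       /* or "leungma", "noreor", "native" */
        int mode = 1;                   /* assumed: 1 = careful, 2 = aggressive */

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        if (setsockopt(fd, IPPROTO_TCP, TCP_REORDER, algo, strlen(algo)) < 0)
                perror("setsockopt(TCP_REORDER)");

        if (setsockopt(fd, IPPROTO_TCP, TCP_REORDER_MODE, &mode, sizeof(mode)) < 0)
                perror("setsockopt(TCP_REORDER_MODE)");

        /* connect()/send() as usual; the chosen module now supplies dupthresh */
        close(fd);
        return 0;
}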