diff --git a/NEWS b/NEWS
index d7d0f3f1528..66d5a4ea375 100644
--- a/NEWS
+++ b/NEWS
@@ -37,9 +37,9 @@ Post-v3.1.0
     - SRv6 Tunnel Protocol
       * Added support for userspace datapath (only).
    - Userspace datapath:
-     * IP checksum offload support is now enabled by default for interfaces
-       that support it. See the 'status' column in the 'interface' table to
-       check the status.
+     * IP and L4 checksum offload support is now enabled by default for
+       interfaces that support it. See the 'status' column in the 'interface'
+       table to check the status.
 
 
 v3.1.0 - 16 Feb 2023
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 78c3e578cb2..f5ebfa05bad 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2060,13 +2060,12 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
     }
 
     if (ok) {
-        bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
-        if (!hwol_bad_l4_csum) {
-            bool hwol_good_l4_csum = dp_packet_l4_checksum_good(pkt)
-                || dp_packet_hwol_tx_l4_checksum(pkt);
+        if (!dp_packet_l4_checksum_bad(pkt)) {
             /* Validate the checksum only when hwol is not supported. */
             if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt),
-                           &ctx->icmp_related, l3, !hwol_good_l4_csum,
+                           &ctx->icmp_related, l3,
+                           !dp_packet_l4_checksum_good(pkt) &&
+                           !dp_packet_hwol_tx_l4_checksum(pkt),
                            NULL)) {
                 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
                 return true;
@@ -3395,8 +3394,10 @@ handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
             adj_seqnum(&th->tcp_seq, ec->seq_skew);
         }
 
-        th->tcp_csum = 0;
-        if (!dp_packet_hwol_tx_l4_checksum(pkt)) {
+        if (dp_packet_hwol_tx_l4_checksum(pkt)) {
+            dp_packet_ol_reset_l4_csum_good(pkt);
+        } else {
+            th->tcp_csum = 0;
             if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
                 th->tcp_csum = packet_csum_upperlayer6(nh6, th,
                                    ctx->key.nw_proto, dp_packet_l4_size(pkt));
diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 35856bf5396..27114a9a998 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -38,6 +38,9 @@ dp_packet_init__(struct dp_packet *b, size_t allocated, enum dp_packet_source so
     dp_packet_init_specific(b);
     /* By default assume the packet type to be Ethernet. */
     b->packet_type = htonl(PT_ETH);
+    /* Reset csum start and offset. */
+    b->csum_start = 0;
+    b->csum_offset = 0;
 }
 
 static void
@@ -546,4 +549,30 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags)
             dp_packet_hwol_reset_tx_ip_csum(p);
         }
     }
+
+    if (!dp_packet_hwol_tx_l4_checksum(p)) {
+        return;
+    }
+
+    if (dp_packet_l4_checksum_good(p)) {
+        dp_packet_hwol_reset_tx_l4_csum(p);
+        return;
+    }
+
+    if (dp_packet_hwol_l4_is_tcp(p)
+        && !(flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
+        packet_tcp_complete_csum(p);
+        dp_packet_ol_set_l4_csum_good(p);
+        dp_packet_hwol_reset_tx_l4_csum(p);
+    } else if (dp_packet_hwol_l4_is_udp(p)
+               && !(flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
+        packet_udp_complete_csum(p);
+        dp_packet_ol_set_l4_csum_good(p);
+        dp_packet_hwol_reset_tx_l4_csum(p);
+    } else if (dp_packet_hwol_l4_is_sctp(p)
+               && !(flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM)) {
+        packet_sctp_complete_csum(p);
+        dp_packet_ol_set_l4_csum_good(p);
+        dp_packet_hwol_reset_tx_l4_csum(p);
+    }
 }
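
The tail added to dp_packet_ol_send_prepare() resolves a deferred L4 checksum
in a fixed order: nothing to do if no offload was requested, drop the request
if the checksum is already known good, otherwise compute the checksum in
software whenever the egress netdev lacks the matching NETDEV_TX_OFFLOAD_*
capability. A standalone sketch of that decision ladder (illustrative only,
with simplified stand-in fields, not part of the patch):

    #include <stdbool.h>

    /* Hypothetical, self-contained model of the ladder above: 'tx_l4_req'
     * models DP_PACKET_OL_TX_*_CKSUM, 'csum_good' models
     * DP_PACKET_OL_RX_L4_CKSUM_GOOD, and 'dev_can_l4' models the egress
     * netdev's NETDEV_TX_OFFLOAD_*_CKSUM capability. */
    struct pkt_model {
        bool tx_l4_req;   /* L4 checksum offload requested for this packet. */
        bool csum_good;   /* Checksum already known to be correct. */
    };

    static void
    send_prepare_model(struct pkt_model *p, bool dev_can_l4)
    {
        if (!p->tx_l4_req) {
            return;                 /* Nothing was requested. */
        }
        if (p->csum_good) {
            p->tx_l4_req = false;   /* Checksum is valid; drop the request. */
            return;
        }
        if (!dev_can_l4) {
            /* Software fallback: compute the checksum here, then mark it
             * good and clear the request, as the patch does with
             * packet_*_complete_csum() + dp_packet_ol_set_l4_csum_good(). */
            p->csum_good = true;
            p->tx_l4_req = false;
        }
        /* Otherwise leave the request set and let the NIC fill it in. */
    }
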
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index af0a2b7f0db..70ddf8aa45a 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -140,6 +140,8 @@ struct dp_packet {
                                        or UINT16_MAX. */
     uint32_t cutlen;               /* length in bytes to cut from the end. */
     ovs_be32 packet_type;          /* Packet type as defined in OpenFlow */
+    uint16_t csum_start;           /* Position to start checksumming from. */
+    uint16_t csum_offset;          /* Offset to place checksum. */
     union {
         struct pkt_metadata md;
         uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
@@ -997,6 +999,13 @@ dp_packet_hwol_is_ipv4(const struct dp_packet *b)
     return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_IPV4);
 }
 
+/* Returns 'true' if packet 'p' is marked as IPv6. */
+static inline bool
+dp_packet_hwol_tx_ipv6(const struct dp_packet *p)
+{
+    return !!(*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_TX_IPV6);
+}
+
 /* Returns 'true' if packet 'b' is marked for TCP checksum offloading. */
 static inline bool
 dp_packet_hwol_l4_is_tcp(const struct dp_packet *b)
@@ -1021,18 +1030,26 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b)
            DP_PACKET_OL_TX_SCTP_CKSUM;
 }
 
-/* Mark packet 'b' for IPv4 checksum offloading. */
 static inline void
-dp_packet_hwol_set_tx_ipv4(struct dp_packet *b)
+dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p)
+{
+    *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_L4_MASK;
+}
+
+/* Mark packet 'p' as IPv4. */
+static inline void
+dp_packet_hwol_set_tx_ipv4(struct dp_packet *p)
 {
-    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV4;
+    *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_IPV6;
+    *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_TX_IPV4;
 }
 
-/* Mark packet 'b' for IPv6 checksum offloading. */
+/* Mark packet 'p' as IPv6. */
 static inline void
-dp_packet_hwol_set_tx_ipv6(struct dp_packet *b)
+dp_packet_hwol_set_tx_ipv6(struct dp_packet *p)
 {
-    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV6;
+    *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_IPV4;
+    *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_TX_IPV6;
 }
 
 /* Returns 'true' if packet 'p' is marked for IPv4 checksum offloading. */
@@ -1147,6 +1164,55 @@ dp_packet_l4_checksum_bad(const struct dp_packet *p)
            DP_PACKET_OL_RX_L4_CKSUM_BAD;
 }
 
+/* Returns 'true' if the packet has good integrity though the
+ * checksum in the packet 'p' is not complete. */
+static inline bool
+dp_packet_ol_l4_csum_partial(const struct dp_packet *p)
+{
+    return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_L4_CKSUM_MASK) ==
+            DP_PACKET_OL_RX_L4_CKSUM_MASK;
+}
+
+/* Marks packet 'p' with good integrity though the checksum in the
+ * packet is not complete. */
+static inline void
+dp_packet_ol_set_l4_csum_partial(struct dp_packet *p)
+{
+    *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_MASK;
+}
+
+/* Marks packet 'p' with good L4 checksum. */
+static inline void
+dp_packet_ol_set_l4_csum_good(struct dp_packet *p)
+{
+    *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_L4_CKSUM_BAD;
+    *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_GOOD;
+}
+
+/* Marks the good L4 checksum in packet 'p' as modified. */
+static inline void
+dp_packet_ol_reset_l4_csum_good(struct dp_packet *p)
+{
+    if (!dp_packet_ol_l4_csum_partial(p)) {
+        *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_L4_CKSUM_GOOD;
+    }
+}
+
+/* Marks packet 'p' with good integrity if the 'start' and 'offset'
+ * match the 'csum_start' and 'csum_offset' in packet 'p'.
+ * The 'start' is the offset from the beginning of the packet headers.
+ * The 'offset' is the offset from 'start' at which to place the checksum.
+ * The csum_start and csum_offset fields are set from the virtio_net_hdr
+ * struct that may be provided by a netdev on packet ingress. */
+static inline void
+dp_packet_ol_l4_csum_check_partial(struct dp_packet *p, uint16_t start,
+                                   uint16_t offset)
+{
+    if (p->csum_start == start && p->csum_offset == offset) {
+        dp_packet_ol_set_l4_csum_partial(p);
+    }
+}
+
 static inline uint32_t ALWAYS_INLINE
 dp_packet_calc_hash_ipv4(const uint8_t *pkt, const uint16_t l3_ofs,
                          uint32_t hash)
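
The new csum_start/csum_offset fields let a packet carry a virtio-style
partial checksum: the sender already summed the pseudo header and tells us
where the final checksum must be written. dp_packet_ol_l4_csum_check_partial()
then just verifies that this location matches the L4 header found while
parsing. A self-contained illustration of that match, using a hypothetical
hdr_model in place of struct dp_packet:

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustration only (not OVS code): a virtio sender that negotiated
     * VIRTIO_NET_F_CSUM left csum_start/csum_offset behind; extraction
     * checks that they line up with where this packet's L4 checksum
     * actually lives. */
    struct hdr_model {
        uint16_t csum_start;    /* From virtio_net_hdr. */
        uint16_t csum_offset;   /* From virtio_net_hdr. */
        uint16_t l4_ofs;        /* Set while parsing the packet. */
    };

    static bool
    l4_csum_is_partial(const struct hdr_model *p, uint16_t csum_field_ofs)
    {
        /* A TCP packet would pass csum_field_ofs ==
         * offsetof(struct tcp_header, tcp_csum), i.e. 16. */
        return p->csum_start == p->l4_ofs
               && p->csum_offset == csum_field_ofs;
    }
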
diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
index 66884eaf041..1bc7e8d0e08 100644
--- a/lib/dpif-netdev-extract-avx512.c
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -698,7 +698,6 @@ mfex_ipv6_set_l2_pad_size(struct dp_packet *pkt,
         return -1;
     }
     dp_packet_set_l2_pad_size(pkt, len_from_ipv6 - (p_len + IPV6_HEADER_LEN));
-    dp_packet_hwol_set_tx_ipv6(pkt);
     return 0;
 }
 
@@ -729,10 +728,6 @@ mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct ip_header *nh,
         return -1;
     }
     dp_packet_set_l2_pad_size(pkt, len_from_ipv4 - ip_tot_len);
-    dp_packet_hwol_set_tx_ipv4(pkt);
-    if (dp_packet_ip_checksum_good(pkt)) {
-        dp_packet_hwol_set_tx_ip_csum(pkt);
-    }
     return 0;
 }
 
@@ -763,6 +758,45 @@ mfex_check_tcp_data_offset(const struct tcp_header *tcp)
     return ret;
 }
 
+static void
+mfex_ipv4_set_hwol(struct dp_packet *pkt)
+{
+    dp_packet_hwol_set_tx_ipv4(pkt);
+    if (dp_packet_ip_checksum_good(pkt)) {
+        dp_packet_hwol_set_tx_ip_csum(pkt);
+    }
+}
+
+static void
+mfex_ipv6_set_hwol(struct dp_packet *pkt)
+{
+    dp_packet_hwol_set_tx_ipv6(pkt);
+}
+
+static void
+mfex_tcp_set_hwol(struct dp_packet *pkt)
+{
+    dp_packet_ol_l4_csum_check_partial(pkt, pkt->l4_ofs,
+                                       offsetof(struct tcp_header,
+                                                tcp_csum));
+    if (dp_packet_l4_checksum_good(pkt)
+        || dp_packet_ol_l4_csum_partial(pkt)) {
+        dp_packet_hwol_set_csum_tcp(pkt);
+    }
+}
+
+static void
+mfex_udp_set_hwol(struct dp_packet *pkt)
+{
+    dp_packet_ol_l4_csum_check_partial(pkt, pkt->l4_ofs,
+                                       offsetof(struct udp_header,
+                                                udp_csum));
+    if (dp_packet_l4_checksum_good(pkt)
+        || dp_packet_ol_l4_csum_partial(pkt)) {
+        dp_packet_hwol_set_csum_udp(pkt);
+    }
+}
+
 /* Generic loop to process any mfex profile. This code is specialized into
  * multiple actual MFEX implementation functions. Its marked ALWAYS_INLINE
  * to ensure the compiler specializes each instance. The code is marked "hot"
@@ -864,6 +898,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                 const struct tcp_header *tcp = (void *)&pkt[38];
                 mfex_handle_tcp_flags(tcp, &blocks[7]);
                 dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
+                mfex_ipv4_set_hwol(packet);
+                mfex_tcp_set_hwol(packet);
             } break;
 
         case PROFILE_ETH_VLAN_IPV4_UDP: {
@@ -876,6 +912,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                     continue;
                 }
                 dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
+                mfex_ipv4_set_hwol(packet);
+                mfex_udp_set_hwol(packet);
             } break;
 
         case PROFILE_ETH_IPV4_TCP: {
@@ -891,6 +929,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                     continue;
                 }
                 dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
+                mfex_ipv4_set_hwol(packet);
+                mfex_tcp_set_hwol(packet);
             } break;
 
         case PROFILE_ETH_IPV4_UDP: {
@@ -902,6 +942,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                     continue;
                 }
                 dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
+                mfex_ipv4_set_hwol(packet);
+                mfex_udp_set_hwol(packet);
             } break;
 
         case PROFILE_ETH_IPV6_UDP: {
@@ -920,6 +962,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                 /* Process UDP header. */
                 mfex_handle_ipv6_l4((void *)&pkt[54], &blocks[9]);
                 dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
+                mfex_ipv6_set_hwol(packet);
+                mfex_udp_set_hwol(packet);
             } break;
 
         case PROFILE_ETH_IPV6_TCP: {
@@ -943,6 +987,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                 }
                 mfex_handle_tcp_flags(tcp, &blocks[9]);
                 dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
+                mfex_ipv6_set_hwol(packet);
+                mfex_tcp_set_hwol(packet);
             } break;
 
         case PROFILE_ETH_VLAN_IPV6_TCP: {
@@ -969,6 +1015,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                 }
                 mfex_handle_tcp_flags(tcp, &blocks[10]);
                 dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
+                mfex_ipv6_set_hwol(packet);
+                mfex_tcp_set_hwol(packet);
             } break;
 
         case PROFILE_ETH_VLAN_IPV6_UDP: {
@@ -990,6 +1038,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                 /* Process UDP header. */
                 mfex_handle_ipv6_l4((void *)&pkt[58], &blocks[10]);
                 dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
+                mfex_ipv6_set_hwol(packet);
+                mfex_udp_set_hwol(packet);
             } break;
 
         case PROFILE_ETH_IPV4_NVGRE: {
@@ -1000,6 +1050,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
                     continue;
                 }
                 dp_packet_update_rss_hash_ipv4(packet);
+                mfex_ipv4_set_hwol(packet);
+                mfex_udp_set_hwol(packet);
             } break;
 
         default:
diff --git a/lib/flow.c b/lib/flow.c
index 9397c99254c..fe226cf0fe5 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -1054,6 +1054,13 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
                 } else if (dl_type == htons(ETH_TYPE_IPV6)) {
                     dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
                 }
+                dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs,
+                                                   offsetof(struct tcp_header,
+                                                            tcp_csum));
+                if (dp_packet_l4_checksum_good(packet)
+                    || dp_packet_ol_l4_csum_partial(packet)) {
+                    dp_packet_hwol_set_csum_tcp(packet);
+                }
             }
         }
     } else if (OVS_LIKELY(nw_proto == IPPROTO_UDP)) {
@@ -1069,6 +1076,13 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
             } else if (dl_type == htons(ETH_TYPE_IPV6)) {
                 dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
             }
+            dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs,
+                                               offsetof(struct udp_header,
+                                                        udp_csum));
+            if (dp_packet_l4_checksum_good(packet)
+                || dp_packet_ol_l4_csum_partial(packet)) {
+                dp_packet_hwol_set_csum_udp(packet);
+            }
         }
     } else if (OVS_LIKELY(nw_proto == IPPROTO_SCTP)) {
         if (OVS_LIKELY(size >= SCTP_HEADER_LEN)) {
@@ -1078,6 +1092,13 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
             miniflow_push_be16(mf, tp_dst, sctp->sctp_dst);
             miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
             miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
+            dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs,
+                                               offsetof(struct sctp_header,
+                                                        sctp_csum));
+            if (dp_packet_l4_checksum_good(packet)
+                || dp_packet_ol_l4_csum_partial(packet)) {
+                dp_packet_hwol_set_csum_sctp(packet);
+            }
         }
     } else if (OVS_LIKELY(nw_proto == IPPROTO_ICMP)) {
         if (OVS_LIKELY(size >= ICMP_HEADER_LEN)) {
@@ -3196,6 +3217,7 @@ flow_compose_l4_csum(struct dp_packet *p, const struct flow *flow,
             tcp->tcp_csum = 0;
             tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
                                                       tcp, l4_len));
+            dp_packet_ol_set_l4_csum_good(p);
         } else if (flow->nw_proto == IPPROTO_UDP) {
             struct udp_header *udp = dp_packet_l4(p);
 
@@ -3205,6 +3227,7 @@ flow_compose_l4_csum(struct dp_packet *p, const struct flow *flow,
             if (!udp->udp_csum) {
                 udp->udp_csum = htons(0xffff);
             }
+            dp_packet_ol_set_l4_csum_good(p);
         } else if (flow->nw_proto == IPPROTO_ICMP) {
             struct icmp_header *icmp = dp_packet_l4(p);
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index cac46eac781..63dac689e38 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -412,8 +412,10 @@ enum dpdk_hw_ol_features {
     NETDEV_RX_HW_CRC_STRIP = 1 << 1,
     NETDEV_RX_HW_SCATTER = 1 << 2,
     NETDEV_TX_IPV4_CKSUM_OFFLOAD = 1 << 3,
-    NETDEV_TX_TSO_OFFLOAD = 1 << 4,
-    NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 5,
+    NETDEV_TX_TCP_CKSUM_OFFLOAD = 1 << 4,
+    NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
+    NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
+    NETDEV_TX_TSO_OFFLOAD = 1 << 7,
 };
 
 /*
@@ -1008,6 +1010,37 @@ dpdk_watchdog(void *dummy OVS_UNUSED)
     return NULL;
 }
 
+static void
+netdev_dpdk_update_netdev_flag(struct netdev_dpdk *dev,
+                               enum dpdk_hw_ol_features hw_ol_features,
+                               enum netdev_ol_flags flag)
+    OVS_REQUIRES(dev->mutex)
+{
+    struct netdev *netdev = &dev->up;
+
+    if (dev->hw_ol_features & hw_ol_features) {
+        netdev->ol_flags |= flag;
+    } else {
+        netdev->ol_flags &= ~flag;
+    }
+}
+
+static void
+netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
+    OVS_REQUIRES(dev->mutex)
+{
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_IPV4_CKSUM_OFFLOAD,
+                                   NETDEV_TX_OFFLOAD_IPV4_CKSUM);
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TCP_CKSUM_OFFLOAD,
+                                   NETDEV_TX_OFFLOAD_TCP_CKSUM);
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_UDP_CKSUM_OFFLOAD,
+                                   NETDEV_TX_OFFLOAD_UDP_CKSUM);
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_SCTP_CKSUM_OFFLOAD,
+                                   NETDEV_TX_OFFLOAD_SCTP_CKSUM);
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
+                                   NETDEV_TX_OFFLOAD_TCP_TSO);
+}
+
 static int
 dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
 {
@@ -1044,11 +1077,20 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
         conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM;
     }
 
+    if (dev->hw_ol_features & NETDEV_TX_TCP_CKSUM_OFFLOAD) {
+        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
+    }
+
+    if (dev->hw_ol_features & NETDEV_TX_UDP_CKSUM_OFFLOAD) {
+        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM;
+    }
+
+    if (dev->hw_ol_features & NETDEV_TX_SCTP_CKSUM_OFFLOAD) {
+        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM;
+    }
+
     if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
-        conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
-        if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
-            conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM;
-        }
+        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
     }
 
     /* Limit configured rss hash functions to only those supported
@@ -1154,7 +1196,6 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
     struct rte_ether_addr eth_addr;
     int diag;
     int n_rxq, n_txq;
-    uint32_t tx_tso_offload_capa = DPDK_TX_TSO_OFFLOAD_FLAGS;
     uint32_t rx_chksm_offload_capa = RTE_ETH_RX_OFFLOAD_UDP_CKSUM |
                                      RTE_ETH_RX_OFFLOAD_TCP_CKSUM |
                                      RTE_ETH_RX_OFFLOAD_IPV4_CKSUM;
@@ -1190,18 +1231,28 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
         dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD;
     }
 
+    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_CKSUM) {
+        dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD;
+    } else {
+        dev->hw_ol_features &= ~NETDEV_TX_TCP_CKSUM_OFFLOAD;
+    }
+
+    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_UDP_CKSUM) {
+        dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD;
+    } else {
+        dev->hw_ol_features &= ~NETDEV_TX_UDP_CKSUM_OFFLOAD;
+    }
+
+    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) {
+        dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD;
+    } else {
+        dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
+    }
+
     dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
     if (userspace_tso_enabled()) {
-        if ((info.tx_offload_capa & tx_tso_offload_capa)
-            == tx_tso_offload_capa) {
+        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) {
             dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
-            if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) {
-                dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
-            } else {
-                VLOG_WARN("%s: Tx SCTP checksum offload is not supported, "
-                          "SCTP packets sent to this device will be dropped",
-                          netdev_get_name(&dev->up));
-            }
         } else {
             VLOG_WARN("%s: Tx TSO offload is not supported.",
                       netdev_get_name(&dev->up));
@@ -2245,6 +2296,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
 
     mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
     mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
+    mbuf->l4_len = 0;
    mbuf->outer_l2_len = 0;
     mbuf->outer_l3_len = 0;
 
@@ -4181,6 +4233,7 @@ new_device(int vid)
         ovs_mutex_lock(&dev->mutex);
         if (nullable_string_is_equal(ifname, dev->vhost_id)) {
             uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
+            uint64_t features;
 
             /* Get NUMA information */
             newnode = rte_vhost_get_numa_node(vid);
@@ -4205,6 +4258,36 @@ new_device(int vid)
                 dev->vhost_reconfigured = true;
             }
 
+            if (rte_vhost_get_negotiated_features(vid, &features)) {
+                VLOG_INFO("Error checking guest features for "
+                          "vHost Device '%s'", dev->vhost_id);
+            } else {
+                if (features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) {
+                    dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD;
+                    dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD;
+                    dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD;
+                }
+
+                if (userspace_tso_enabled()) {
+                    if (features & (1ULL << VIRTIO_NET_F_GUEST_TSO4)
+                        && features & (1ULL << VIRTIO_NET_F_GUEST_TSO6)) {
+
+                        dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
+                        VLOG_DBG("%s: TSO enabled on vhost port",
+                                 netdev_get_name(&dev->up));
+                    } else {
+                        VLOG_WARN("%s: Tx TSO offload is not supported.",
+                                  netdev_get_name(&dev->up));
+                    }
+                }
+            }
+
+            /* There is no support in virtio net to offload IPv4 csum,
+             * but the vhost library handles IPv4 csum offloading fine. */
+            dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
+
+            netdev_dpdk_update_netdev_flags(dev);
+
             ovsrcu_index_set(&dev->vid, vid);
             exists = true;
@@ -4268,6 +4351,10 @@ destroy_device(int vid)
                    dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
             netdev_dpdk_txq_map_clear(dev);
 
+            /* Clear offload capabilities before next new_device. */
+            dev->hw_ol_features = 0;
+            netdev_dpdk_update_netdev_flags(dev);
+
             netdev_change_seq_changed(&dev->up);
             ovs_mutex_unlock(&dev->mutex);
             exists = true;
@@ -5278,22 +5365,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
     }
 
     err = dpdk_eth_dev_init(dev);
-
-    if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) {
-        netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
-    } else {
-        netdev->ol_flags &= ~NETDEV_TX_OFFLOAD_IPV4_CKSUM;
-    }
-
-    if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
-        netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
-        netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
-        netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
-        netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
-        if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
-            netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
-        }
-    }
+    netdev_dpdk_update_netdev_flags(dev);
 
     /* If both requested and actual hwaddr were previously
      * unset (initialized to 0), then first device init above
@@ -5340,11 +5412,6 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
     memset(dev->sw_stats, 0, sizeof *dev->sw_stats);
     rte_spinlock_unlock(&dev->stats_lock);
 
-    if (userspace_tso_enabled()) {
-        dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
-        VLOG_DBG("%s: TSO enabled on vhost port", netdev_get_name(&dev->up));
-    }
-
     netdev_dpdk_remap_txqs(dev);
 
     if (netdev_dpdk_get_vid(dev) >= 0) {
@@ -5365,6 +5432,8 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
         }
     }
 
+    netdev_dpdk_update_netdev_flags(dev);
+
     return 0;
 }
 
@@ -5386,8 +5455,6 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     int err;
-    uint64_t vhost_flags = 0;
-    uint64_t vhost_unsup_flags;
 
     ovs_mutex_lock(&dev->mutex);
 
@@ -5397,6 +5464,9 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
      * 2. A path has been specified.
      */
     if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
+        uint64_t virtio_unsup_features = 0;
+        uint64_t vhost_flags = 0;
+
         /* Register client-mode device. */
         vhost_flags |= RTE_VHOST_USER_CLIENT;
 
@@ -5443,22 +5513,22 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
         }
 
         if (userspace_tso_enabled()) {
-            netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
-            netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
-            netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
-            netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
-            netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
-            vhost_unsup_flags = 1ULL << VIRTIO_NET_F_HOST_ECN
-                                | 1ULL << VIRTIO_NET_F_HOST_UFO;
+            virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_ECN
+                                    | 1ULL << VIRTIO_NET_F_HOST_UFO;
+            VLOG_DBG("%s: TSO enabled on vhost port",
+                     netdev_get_name(&dev->up));
         } else {
-            /* This disables checksum offloading and all the features
-             * that depends on it (TSO, UFO, ECN) according to virtio
-             * specification. */
-            vhost_unsup_flags = 1ULL << VIRTIO_NET_F_CSUM;
+            /* Advertise checksum offloading to the guest, but explicitly
+             * disable TSO and friends.
+             * NOTE: we can't disable HOST_ECN which may have been wrongly
+             * negotiated by a running guest. */
+            virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_TSO4
+                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
+                                    | 1ULL << VIRTIO_NET_F_HOST_UFO;
         }
 
         err = rte_vhost_driver_disable_features(dev->vhost_id,
-                                                vhost_unsup_flags);
+                                                virtio_unsup_features);
         if (err) {
             VLOG_ERR("rte_vhost_driver_disable_features failed for "
                      "vhost user client port: %s\n", dev->up.name);
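
new_device() above derives the port's offload capabilities from the virtio
features the guest actually negotiated. A minimal illustration of those bit
tests (the feature bit numbers follow the VIRTIO spec; the helper names here
are hypothetical, not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    #define F_GUEST_CSUM  1   /* VIRTIO_NET_F_GUEST_CSUM */
    #define F_GUEST_TSO4  7   /* VIRTIO_NET_F_GUEST_TSO4 */
    #define F_GUEST_TSO6  8   /* VIRTIO_NET_F_GUEST_TSO6 */

    static bool
    guest_accepts_csum(uint64_t features)
    {
        return features & (1ULL << F_GUEST_CSUM);
    }

    static bool
    guest_accepts_tso(uint64_t features)
    {
        /* Both IPv4 and IPv6 TSO must be negotiated, mirroring the
         * TSO4 && TSO6 check in new_device(). */
        return (features & (1ULL << F_GUEST_TSO4))
               && (features & (1ULL << F_GUEST_TSO6));
    }
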
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 49c74346a42..3dba2ef1fe4 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -530,6 +530,11 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
  * changes in the device miimon status, so we can use atomic_count. */
 static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
 
+/* Very old kernels from the 2.6 era don't support vnet headers with the tun
+ * device. We can detect this while constructing a netdev, but need this for
+ * packet rx/tx. */
+static bool tap_supports_vnet_hdr = true;
+
 static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
 static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
@@ -938,14 +943,6 @@ netdev_linux_common_construct(struct netdev *netdev_)
     netnsid_unset(&netdev->netnsid);
     ovs_mutex_init(&netdev->mutex);
 
-    if (userspace_tso_enabled()) {
-        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
-        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
-        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
-        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
-        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
-    }
-
     return 0;
 }
 
@@ -959,6 +956,16 @@ netdev_linux_construct(struct netdev *netdev_)
         return error;
     }
 
+    /* The socket interface doesn't offer the option to enable only
+     * csum offloading without TSO. */
+    if (userspace_tso_enabled()) {
+        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
+        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
+        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
+        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
+        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
+    }
+
     error = get_flags(&netdev->up, &netdev->ifi_flags);
     if (error == ENODEV) {
         if (netdev->up.netdev_class != &netdev_internal_class) {
@@ -984,9 +991,12 @@ netdev_linux_construct(struct netdev *netdev_)
 static int
 netdev_linux_construct_tap(struct netdev *netdev_)
 {
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
     static const char tap_dev[] = "/dev/net/tun";
     const char *name = netdev_->name;
+    unsigned long oflags;
+    unsigned int up;
     struct ifreq ifr;
 
     int error = netdev_linux_common_construct(netdev_);
@@ -1004,8 +1014,21 @@ netdev_linux_construct_tap(struct netdev *netdev_)
 
     /* Create tap device. */
     get_flags(&netdev->up, &netdev->ifi_flags);
+
+    if (ovsthread_once_start(&once)) {
+        if (ioctl(netdev->tap_fd, TUNGETFEATURES, &up) == -1) {
+            VLOG_WARN("%s: querying tap features failed: %s", name,
+                      ovs_strerror(errno));
+            tap_supports_vnet_hdr = false;
+        } else if (!(up & IFF_VNET_HDR)) {
+            VLOG_WARN("TAP interfaces do not support virtio-net headers");
+            tap_supports_vnet_hdr = false;
+        }
+        ovsthread_once_done(&once);
+    }
+
     ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
-    if (userspace_tso_enabled()) {
+    if (tap_supports_vnet_hdr) {
         ifr.ifr_flags |= IFF_VNET_HDR;
     }
 
@@ -1030,21 +1053,23 @@ netdev_linux_construct_tap(struct netdev *netdev_)
         goto error_close;
     }
 
+    oflags = TUN_F_CSUM;
     if (userspace_tso_enabled()) {
-        /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
-         * available, it will return EINVAL when a flag is unknown.
-         * Therefore, try enabling offload with no flags to check
-         * if TUNSETOFFLOAD support is available or not. */
-        if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL) {
-            unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
-
-            if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) {
-                VLOG_WARN("%s: enabling tap offloading failed: %s", name,
-                          ovs_strerror(errno));
-                error = errno;
-                goto error_close;
-            }
-        }
+        oflags |= (TUN_F_TSO4 | TUN_F_TSO6);
+    }
+
+    if (tap_supports_vnet_hdr
+        && ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == 0) {
+        netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_IPV4_CKSUM
+                              | NETDEV_TX_OFFLOAD_TCP_CKSUM
+                              | NETDEV_TX_OFFLOAD_UDP_CKSUM);
+
+        if (userspace_tso_enabled()) {
+            netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
+        }
+    } else {
+        VLOG_INFO("%s: Disabling checksum and segment offloading due to "
+                  "missing kernel support", name);
     }
 
     netdev->present = true;
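
The construct-time probe above first asks the kernel whether the tap supports
vnet headers at all (TUNGETFEATURES/IFF_VNET_HDR) and only then requests
checksum, and optionally TSO, offload via TUNSETOFFLOAD. A condensed sketch
of the same sequence, assuming an already-open tap descriptor (illustrative
only, not part of the patch):

    #include <linux/if_tun.h>
    #include <sys/ioctl.h>
    #include <stdbool.h>

    /* Returns true if the kernel accepts vnet headers plus checksum
     * (and optionally TSO) offload on 'tap_fd'. */
    static bool
    probe_tap_offloads(int tap_fd, bool want_tso)
    {
        unsigned int features;
        unsigned long oflags = TUN_F_CSUM;

        if (ioctl(tap_fd, TUNGETFEATURES, &features) == -1
            || !(features & IFF_VNET_HDR)) {
            return false;       /* Very old kernel: no vnet headers. */
        }
        if (want_tso) {
            oflags |= TUN_F_TSO4 | TUN_F_TSO6;
        }
        return ioctl(tap_fd, TUNSETOFFLOAD, oflags) == 0;
    }
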
@@ -1344,18 +1369,23 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
             pkt = buffers[i];
         }
 
-        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
-            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
-            struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+        if (virtio_net_hdr_size) {
+            int ret = netdev_linux_parse_vnet_hdr(pkt);
+            if (OVS_UNLIKELY(ret)) {
+                struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
+                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
 
-            /* Unexpected error situation: the virtio header is not present
-             * or corrupted. Drop the packet but continue in case next ones
-             * are correct. */
-            dp_packet_delete(pkt);
-            netdev->rx_dropped += 1;
-            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
-                         netdev_get_name(netdev_));
-            continue;
+                /* Unexpected error situation: the virtio header is not
+                 * present or corrupted or contains unsupported features.
+                 * Drop the packet but continue in case next ones are
+                 * correct. */
+                dp_packet_delete(pkt);
+                netdev->rx_dropped += 1;
+                VLOG_WARN_RL(&rl, "%s: Dropped packet: vnet header is missing "
+                             "or corrupt: %s", netdev_get_name(netdev_),
+                             ovs_strerror(ret));
+                continue;
+            }
         }
 
         for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
@@ -1413,10 +1443,13 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
         /* Use the buffer from the allocated packet below to receive MTU
          * sized packets and an aux_buf for extra TSO data. */
         iovlen = IOV_TSO_SIZE;
-        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
     } else {
         /* Use only the buffer from the allocated packet. */
         iovlen = IOV_STD_SIZE;
+    }
+    if (OVS_LIKELY(tap_supports_vnet_hdr)) {
+        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
+    } else {
         virtio_net_hdr_size = 0;
     }
 
@@ -1462,7 +1495,8 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
             pkt = buffer;
         }
 
-        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
+        if (OVS_LIKELY(virtio_net_hdr_size) &&
+            netdev_linux_parse_vnet_hdr(pkt)) {
             struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
             struct netdev_linux *netdev = netdev_linux_cast(netdev_);
 
@@ -1611,7 +1645,7 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu,
  * on other interface types because we attach a socket filter to the rx
  * socket. */
 static int
-netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu,
+netdev_linux_tap_batch_send(struct netdev *netdev_, int mtu,
                             struct dp_packet_batch *batch)
 {
     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
@@ -1632,7 +1666,7 @@ netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu,
         ssize_t retval;
         int error;
 
-        if (tso) {
+        if (OVS_LIKELY(tap_supports_vnet_hdr)) {
             netdev_linux_prepend_vnet_hdr(packet, mtu);
         }
 
@@ -1765,7 +1799,7 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
         error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu,
                                              batch);
     } else {
-        error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
+        error = netdev_linux_tap_batch_send(netdev_, mtu, batch);
     }
     if (error) {
         if (error == ENOBUFS) {
@@ -6846,53 +6880,76 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
     return 0;
 }
 
+/* Initializes packet 'b' with features enabled in the prepended
+ * struct virtio_net_hdr. Returns 0 if successful, otherwise a
+ * positive errno value. */
 static int
 netdev_linux_parse_vnet_hdr(struct dp_packet *b)
 {
     struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);
-    uint16_t l4proto = 0;
 
     if (OVS_UNLIKELY(!vnet)) {
-        return -EINVAL;
+        return EINVAL;
     }
 
     if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
         return 0;
     }
 
-    if (netdev_linux_parse_l2(b, &l4proto)) {
-        return -EINVAL;
-    }
-
     if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-        if (l4proto == IPPROTO_TCP) {
-            dp_packet_hwol_set_csum_tcp(b);
-        } else if (l4proto == IPPROTO_UDP) {
+        uint16_t l4proto = 0;
+
+        if (netdev_linux_parse_l2(b, &l4proto)) {
+            return EINVAL;
+        }
+
+        if (l4proto == IPPROTO_UDP) {
             dp_packet_hwol_set_csum_udp(b);
-        } else if (l4proto == IPPROTO_SCTP) {
-            dp_packet_hwol_set_csum_sctp(b);
         }
+        /* The packet has offloaded checksum. However, there is no
+         * additional information like the protocol used, so it would
+         * require to parse the packet here. The checksum starting point
+         * and offset are going to be verified when the packet headers
+         * are parsed during miniflow extraction. */
+        b->csum_start = (OVS_FORCE uint16_t) vnet->csum_start;
+        b->csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset;
+    } else {
+        b->csum_start = 0;
+        b->csum_offset = 0;
     }
 
-    if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
-        uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4
-                                | VIRTIO_NET_HDR_GSO_TCPV6
-                                | VIRTIO_NET_HDR_GSO_UDP;
-        uint8_t type = vnet->gso_type & allowed_mask;
+    int ret = 0;
+    switch (vnet->gso_type) {
+    case VIRTIO_NET_HDR_GSO_TCPV4:
+    case VIRTIO_NET_HDR_GSO_TCPV6:
+        /* FIXME: The packet has offloaded TCP segmentation. The gso_size
+         * is given and needs to be respected. */
+        dp_packet_hwol_set_tcp_seg(b);
+        break;
 
-        if (type == VIRTIO_NET_HDR_GSO_TCPV4
-            || type == VIRTIO_NET_HDR_GSO_TCPV6) {
-            dp_packet_hwol_set_tcp_seg(b);
-        }
+    case VIRTIO_NET_HDR_GSO_UDP:
+        /* UFO is not supported. */
+        VLOG_WARN_RL(&rl, "Received an unsupported packet with UFO enabled.");
+        ret = ENOTSUP;
+        break;
+
+    case VIRTIO_NET_HDR_GSO_NONE:
+        break;
+
+    default:
+        ret = ENOTSUP;
+        VLOG_WARN_RL(&rl, "Received an unsupported packet with GSO type: 0x%x",
+                     vnet->gso_type);
     }
 
-    return 0;
+    return ret;
 }
 
 static void
 netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
 {
-    struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet);
+    struct virtio_net_hdr v;
+    struct virtio_net_hdr *vnet = &v;
 
     if (dp_packet_hwol_is_tso(b)) {
         uint16_t hdr_len = ((char *)dp_packet_l4(b)
@@ -6902,30 +6959,91 @@
         vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len);
         if (dp_packet_hwol_is_ipv4(b)) {
             vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
-        } else {
+        } else if (dp_packet_hwol_tx_ipv6(b)) {
             vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
         }
     } else {
-        vnet->flags = VIRTIO_NET_HDR_GSO_NONE;
+        vnet->hdr_len = 0;
+        vnet->gso_size = 0;
+        vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE;
     }
 
-    if (dp_packet_hwol_l4_mask(b)) {
-        vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-        vnet->csum_start = (OVS_FORCE __virtio16)((char *)dp_packet_l4(b)
-                                                  - (char *)dp_packet_eth(b));
-
+    if (dp_packet_l4_checksum_good(b)) {
+        /* The packet has good L4 checksum. No need to validate again. */
+        vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
+        vnet->flags = VIRTIO_NET_HDR_F_DATA_VALID;
+    } else if (dp_packet_hwol_tx_l4_checksum(b)) {
+        /* The csum calculation is offloaded. */
         if (dp_packet_hwol_l4_is_tcp(b)) {
+            /* Virtual I/O Device (VIRTIO) Version 1.1
+             * 5.1.6.2 Packet Transmission
+             * If the driver negotiated VIRTIO_NET_F_CSUM, it can skip
+             * checksumming the packet:
+             *  - flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
+             *  - csum_start is set to the offset within the packet
+             *    to begin checksumming, and
+             *  - csum_offset indicates how many bytes after the
+             *    csum_start the new (16 bit ones complement) checksum
+             *    is placed by the device.
+             * The TCP checksum field in the packet is set to the sum of
+             * the TCP pseudo header, so that replacing it by the ones
+             * complement checksum of the TCP header and body will give
+             * the correct result. */
+
+            struct tcp_header *tcp_hdr = dp_packet_l4(b);
+            ovs_be16 csum = 0;
+
+            if (dp_packet_hwol_is_ipv4(b)) {
+                const struct ip_header *ip_hdr = dp_packet_l3(b);
+                csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr));
+            } else if (dp_packet_hwol_tx_ipv6(b)) {
+                const struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(b);
+                csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr));
+            }
+
+            tcp_hdr->tcp_csum = csum;
+            vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+            vnet->csum_start = (OVS_FORCE __virtio16) b->l4_ofs;
             vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
                                     struct tcp_header, tcp_csum);
         } else if (dp_packet_hwol_l4_is_udp(b)) {
+            struct udp_header *udp_hdr = dp_packet_l4(b);
+            ovs_be16 csum = 0;
+
+            if (dp_packet_hwol_is_ipv4(b)) {
+                const struct ip_header *ip_hdr = dp_packet_l3(b);
+                csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr));
+            } else if (dp_packet_hwol_tx_ipv6(b)) {
+                const struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(b);
+                csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr));
+            }
+
+            udp_hdr->udp_csum = csum;
+            vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+            vnet->csum_start = (OVS_FORCE __virtio16) b->l4_ofs;
             vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
                                     struct udp_header, udp_csum);
         } else if (dp_packet_hwol_l4_is_sctp(b)) {
-            vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
-                                    struct sctp_header, sctp_csum);
+            /* The Linux kernel networking stack only supports csum_start
+             * and csum_offset when SCTP GSO is enabled. See kernel's
+             * skb_csum_hwoffload_help(). Currently there is no SCTP
+             * segmentation offload support in OVS. */
+            vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
+            vnet->flags = 0;
         } else {
-            VLOG_WARN_RL(&rl, "Unsupported L4 protocol");
+            /* This should only happen when DP_PACKET_OL_TX_L4_MASK includes
+             * a new flag that is not covered in above checks. */
+            VLOG_WARN_RL(&rl, "Unsupported L4 checksum offload. "
+                         "Flags: %"PRIu64,
                         (uint64_t)*dp_packet_ol_flags_ptr(b));
+            vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
+            vnet->flags = 0;
         }
+    } else {
+        /* Packet L4 csum is unknown. */
+        vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
+        vnet->flags = 0;
     }
+
+    dp_packet_push(b, vnet, sizeof *vnet);
 }
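
The VIRTIO_NET_HDR_F_NEEDS_CSUM path above seeds the TCP/UDP checksum field
with the folded pseudo-header sum (~csum_finish() of the pseudo header), so
the device can finish the job by writing the ones' complement sum of the L4
header and body at csum_start + csum_offset. A standalone worked example of
computing that seed for IPv4 (illustrative only, not OVS code):

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    static uint16_t
    fold32(uint32_t sum)
    {
        while (sum >> 16) {
            sum = (sum & 0xffff) + (sum >> 16);
        }
        return (uint16_t) sum;
    }

    int
    main(void)
    {
        uint32_t src = ntohl(inet_addr("10.0.0.1"));
        uint32_t dst = ntohl(inet_addr("10.0.0.2"));
        uint16_t tcp_len = 20 + 12;   /* TCP header + payload length. */
        uint32_t sum = 0;

        sum += (src >> 16) + (src & 0xffff);
        sum += (dst >> 16) + (dst & 0xffff);
        sum += 6;                     /* IPPROTO_TCP. */
        sum += tcp_len;

        /* The seed is the folded pseudo-header sum itself (not inverted);
         * the device replaces it with the ones' complement checksum of
         * the whole TCP header and body. */
        printf("pseudo-header seed: 0x%04x\n", fold32(sum));
        return 0;
    }
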
diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
index 72d24459854..715bbab2bec 100644
--- a/lib/netdev-native-tnl.c
+++ b/lib/netdev-native-tnl.c
@@ -225,28 +225,6 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
     return udp + 1;
 }
 
-static void
-netdev_tnl_calc_udp_csum(struct udp_header *udp, struct dp_packet *packet,
-                         int ip_tot_size)
-{
-    uint32_t csum;
-
-    if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) {
-        csum = packet_csum_pseudoheader6(netdev_tnl_ipv6_hdr(
-                   dp_packet_data(packet)));
-    } else {
-        csum = packet_csum_pseudoheader(netdev_tnl_ip_hdr(
-                   dp_packet_data(packet)));
-    }
-
-    csum = csum_continue(csum, udp, ip_tot_size);
-    udp->udp_csum = csum_finish(csum);
-
-    if (!udp->udp_csum) {
-        udp->udp_csum = htons(0xffff);
-    }
-}
-
 void
 netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED,
                            struct dp_packet *packet,
@@ -262,8 +240,12 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED,
     udp->udp_src = netdev_tnl_get_src_port(packet);
     udp->udp_len = htons(ip_tot_size);
 
+    /* Postpone checksum to the egress netdev. */
+    dp_packet_hwol_set_csum_udp(packet);
     if (udp->udp_csum) {
-        netdev_tnl_calc_udp_csum(udp, packet, ip_tot_size);
+        dp_packet_ol_reset_l4_csum_good(packet);
+    } else {
+        dp_packet_ol_set_l4_csum_good(packet);
     }
 }
 
@@ -793,7 +775,9 @@ netdev_gtpu_push_header(const struct netdev *netdev,
                         &ip_tot_size, 0);
     udp->udp_src = netdev_tnl_get_src_port(packet);
     udp->udp_len = htons(ip_tot_size);
-    netdev_tnl_calc_udp_csum(udp, packet, ip_tot_size);
+    /* Postpone checksum to the egress netdev. */
+    dp_packet_hwol_set_csum_udp(packet);
+    dp_packet_ol_reset_l4_csum_good(packet);
 
     gtpuh = ALIGNED_CAST(struct gtpuhdr *, udp + 1);
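
netdev_tnl_push_udp_header() now defers the outer UDP checksum to the egress
netdev instead of computing it inline. The branch on udp->udp_csum reflects
UDP's zero-checksum rule: a zero field in the header template means no
checksum is required, so the packet can be marked good immediately; a nonzero
field means a real checksum is still owed. A small model of that flag dance
(illustrative only, with stand-in fields):

    #include <stdbool.h>

    struct tnl_pkt_model {
        bool tx_udp_csum_req;   /* DP_PACKET_OL_TX_UDP_CKSUM equivalent. */
        bool l4_csum_good;      /* RX "checksum known good" equivalent.  */
    };

    /* 'want_csum' mirrors a nonzero udp_csum in the tunnel template. */
    static void
    push_udp_model(struct tnl_pkt_model *p, bool want_csum)
    {
        p->tx_udp_csum_req = true;     /* Postpone to egress netdev. */
        if (want_csum) {
            p->l4_csum_good = false;   /* Header changed; must compute. */
        } else {
            p->l4_csum_good = true;    /* Zero csum is legal for UDP;
                                        * nothing left to compute. */
        }
    }
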
diff --git a/lib/netdev.c b/lib/netdev.c
index b86afbf36d4..8df7f873715 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -799,8 +799,6 @@ static bool
 netdev_send_prepare_packet(const uint64_t netdev_flags,
                            struct dp_packet *packet, char **errormsg)
 {
-    uint64_t l4_mask;
-
     if (dp_packet_hwol_is_tso(packet)
         && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
             /* Fall back to GSO in software. */
@@ -813,36 +811,16 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
      * netdev to decide what would be the best to do.
      * Provide a software fallback in case the device doesn't support IP csum
      * offloading. Note: Encapsulated packet must have the inner IP header
+     * csum already calculated.
+     * Packet with L4 csum offloading enabled was received with verified csum.
+     * Leave the L4 csum offloading enabled even with good checksum for the
+     * netdev to decide what would be the best to do.
+     * Netdev that requires pseudo header csum needs to calculate that.
+     * Provide a software fallback in case the netdev doesn't support L4 csum
+     * offloading. Note: Encapsulated packet must have the inner L4 header
      * csum already calculated. */
     dp_packet_ol_send_prepare(packet, netdev_flags);
 
-    l4_mask = dp_packet_hwol_l4_mask(packet);
-    if (l4_mask) {
-        if (dp_packet_hwol_l4_is_tcp(packet)) {
-            if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
-                /* Fall back to TCP csum in software. */
-                VLOG_ERR_BUF(errormsg, "No TCP checksum support");
-                return false;
-            }
-        } else if (dp_packet_hwol_l4_is_udp(packet)) {
-            if (!(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
-                /* Fall back to UDP csum in software. */
-                VLOG_ERR_BUF(errormsg, "No UDP checksum support");
-                return false;
-            }
-        } else if (dp_packet_hwol_l4_is_sctp(packet)) {
-            if (!(netdev_flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM)) {
-                /* Fall back to SCTP csum in software. */
-                VLOG_ERR_BUF(errormsg, "No SCTP checksum support");
-                return false;
-            }
-        } else {
-            VLOG_ERR_BUF(errormsg, "No L4 checksum support: mask: %"PRIu64,
-                         l4_mask);
-            return false;
-        }
-    }
-
     return true;
 }
 
@@ -975,20 +953,16 @@ netdev_push_header(const struct netdev *netdev,
     size_t i, size = dp_packet_batch_size(batch);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
-                         || dp_packet_hwol_l4_mask(packet))) {
+        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet))) {
             COVERAGE_INC(netdev_push_header_drops);
             dp_packet_delete(packet);
-            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
+            VLOG_WARN_RL(&rl, "%s: Tunneling packets with TSO is "
                          "not supported: packet dropped",
                         netdev_get_name(netdev));
         } else {
             /* The packet is going to be encapsulated and there is
              * no support yet for inner network header csum offloading. */
-            if (dp_packet_hwol_tx_ip_csum(packet)
-                && !dp_packet_ip_checksum_good(packet)) {
-                dp_packet_ip_set_header_csum(packet);
-            }
+            dp_packet_ol_send_prepare(packet, 0);
 
             netdev->netdev_class->push_header(netdev, packet, data);
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 9597f3554ce..747e04014ab 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -486,9 +486,11 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
         size_t l4_size = dp_packet_l4_size(packet);
 
         if (nh->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) {
-            /* New UDP checksum. */
             struct udp_header *uh = dp_packet_l4(packet);
-            if (uh->udp_csum) {
+            if (dp_packet_hwol_l4_is_udp(packet)) {
+                dp_packet_ol_reset_l4_csum_good(packet);
+            } else if (uh->udp_csum) {
+                /* New UDP checksum. */
                 uint16_t old_udp_checksum = ~uh->udp_csum;
                 uint32_t udp_checksum = old_udp_checksum + delta_checksum;
                 udp_checksum = csum_finish(udp_checksum);
@@ -501,13 +503,17 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
             }
 
         } else if (nh->ip_proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) {
-            /* New TCP checksum. */
-            struct tcp_header *th = dp_packet_l4(packet);
-            uint16_t old_tcp_checksum = ~th->tcp_csum;
-            uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
-            tcp_checksum = csum_finish(tcp_checksum);
-
-            th->tcp_csum = tcp_checksum;
+            if (dp_packet_hwol_l4_is_tcp(packet)) {
+                dp_packet_ol_reset_l4_csum_good(packet);
+            } else {
+                /* New TCP checksum. */
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_tcp_checksum = ~th->tcp_csum;
+                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
+                tcp_checksum = csum_finish(tcp_checksum);
+
+                th->tcp_csum = tcp_checksum;
+            }
         }
 
         pkt_metadata_init_conn(&packet->md);
@@ -569,11 +575,22 @@ avx512_ipv6_sum_header(__m512i ip6_header)
 
 static inline uint16_t ALWAYS_INLINE
 __attribute__((__target__("avx512vbmi")))
-avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
+avx512_ipv6_addr_csum_delta(__m512i v_packet, __m512i v_new_hdr,
+                            bool rh_present)
 {
-    uint16_t old_delta = avx512_ipv6_sum_header(old_header);
-    uint16_t new_delta = avx512_ipv6_sum_header(new_header);
-    uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta;
+    __m512i v_new_hdr_for_cksum = v_new_hdr;
+    uint32_t csum_delta;
+    uint16_t old_delta;
+    uint16_t new_delta;
+
+    if (rh_present) {
+        v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18, v_new_hdr,
+                                                      v_packet);
+    }
+
+    old_delta = avx512_ipv6_sum_header(v_packet);
+    new_delta = avx512_ipv6_sum_header(v_new_hdr_for_cksum);
+    csum_delta = ((uint16_t) ~old_delta) + new_delta;
 
     return ~csum_finish(csum_delta);
 }
@@ -656,25 +673,19 @@ action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a)
 
         if (do_csum) {
             size_t l4_size = dp_packet_l4_size(packet);
-            __m512i v_new_hdr_for_cksum = v_new_hdr;
             uint16_t delta_checksum;
 
-            /* In case of routing header being present, checksum should not be
-             * updated for the destination address. */
-            if (rh_present) {
-                v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18, v_new_hdr,
-                                                              v_packet);
-            }
-
-            delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
-                                                         v_new_hdr_for_cksum);
-
             if (proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) {
                 struct udp_header *uh = dp_packet_l4(packet);
-
-                if (uh->udp_csum) {
+                if (dp_packet_hwol_l4_is_udp(packet)) {
+                    dp_packet_ol_reset_l4_csum_good(packet);
+                } else if (uh->udp_csum) {
+                    delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
+                                                                 v_new_hdr,
+                                                                 rh_present);
                     uint16_t old_udp_checksum = ~uh->udp_csum;
-                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;
+                    uint32_t udp_checksum = old_udp_checksum +
+                                            delta_checksum;
 
                     udp_checksum = csum_finish(udp_checksum);
 
@@ -684,15 +695,26 @@ action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a)
 
                     uh->udp_csum = udp_checksum;
                 }
-            } else if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) {
-                struct tcp_header *th = dp_packet_l4(packet);
-                uint16_t old_tcp_checksum = ~th->tcp_csum;
-                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
-                tcp_checksum = csum_finish(tcp_checksum);
 
-                th->tcp_csum = tcp_checksum;
+            } else if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) {
+                if (dp_packet_hwol_l4_is_tcp(packet)) {
+                    dp_packet_ol_reset_l4_csum_good(packet);
+                } else {
+                    delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
+                                                                 v_new_hdr,
+                                                                 rh_present);
+                    struct tcp_header *th = dp_packet_l4(packet);
+                    uint16_t old_tcp_checksum = ~th->tcp_csum;
+                    uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
+
+                    tcp_checksum = csum_finish(tcp_checksum);
+                    th->tcp_csum = tcp_checksum;
+                }
             } else if (proto == IPPROTO_ICMPV6 &&
                        l4_size >= sizeof(struct icmp6_header)) {
+                delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
+                                                             v_new_hdr,
+                                                             rh_present);
                 struct icmp6_header *icmp = dp_packet_l4(packet);
                 uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
                 uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum;
diff --git a/lib/packets.c b/lib/packets.c
index a4ccc21f823..462b51f92dc 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -1131,16 +1131,22 @@ packet_set_ipv4_addr(struct dp_packet *packet,
     pkt_metadata_init_conn(&packet->md);
 
     if (nh->ip_proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) {
-        struct tcp_header *th = dp_packet_l4(packet);
-
-        th->tcp_csum = recalc_csum32(th->tcp_csum, old_addr, new_addr);
+        if (dp_packet_hwol_l4_is_tcp(packet)) {
+            dp_packet_ol_reset_l4_csum_good(packet);
+        } else {
+            struct tcp_header *th = dp_packet_l4(packet);
+
+            th->tcp_csum = recalc_csum32(th->tcp_csum, old_addr, new_addr);
+        }
     } else if (nh->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN ) {
-        struct udp_header *uh = dp_packet_l4(packet);
-
-        if (uh->udp_csum) {
-            uh->udp_csum = recalc_csum32(uh->udp_csum, old_addr, new_addr);
-            if (!uh->udp_csum) {
-                uh->udp_csum = htons(0xffff);
+        if (dp_packet_hwol_l4_is_udp(packet)) {
+            dp_packet_ol_reset_l4_csum_good(packet);
+        } else {
+            struct udp_header *uh = dp_packet_l4(packet);
+
+            if (uh->udp_csum) {
+                uh->udp_csum = recalc_csum32(uh->udp_csum, old_addr, new_addr);
+                if (!uh->udp_csum) {
+                    uh->udp_csum = htons(0xffff);
+                }
             }
         }
     }
@@ -1246,16 +1252,24 @@ packet_update_csum128(struct dp_packet *packet, uint8_t proto,
     size_t l4_size = dp_packet_l4_size(packet);
 
     if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) {
-        struct tcp_header *th = dp_packet_l4(packet);
+        if (dp_packet_hwol_l4_is_tcp(packet)) {
+            dp_packet_ol_reset_l4_csum_good(packet);
+        } else {
+            struct tcp_header *th = dp_packet_l4(packet);
 
-        th->tcp_csum = recalc_csum128(th->tcp_csum, addr, new_addr);
+            th->tcp_csum = recalc_csum128(th->tcp_csum, addr, new_addr);
+        }
     } else if (proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) {
-        struct udp_header *uh = dp_packet_l4(packet);
+        if (dp_packet_hwol_l4_is_udp(packet)) {
+            dp_packet_ol_reset_l4_csum_good(packet);
+        } else {
+            struct udp_header *uh = dp_packet_l4(packet);
 
-        if (uh->udp_csum) {
-            uh->udp_csum = recalc_csum128(uh->udp_csum, addr, new_addr);
-            if (!uh->udp_csum) {
-                uh->udp_csum = htons(0xffff);
+            if (uh->udp_csum) {
+                uh->udp_csum = recalc_csum128(uh->udp_csum, addr, new_addr);
+                if (!uh->udp_csum) {
+                    uh->udp_csum = htons(0xffff);
+                }
             }
         }
     } else if (proto == IPPROTO_ICMPV6 &&
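
packet_set_ipv4_addr() and packet_update_csum128() keep relying on
incremental updates (recalc_csum32()/recalc_csum128()) when no offload is
pending, adjusting the existing checksum for just the bytes that changed.
A standalone sketch of the RFC 1624 arithmetic behind that (illustrative
only; byte-order handling of the real helpers is omitted):

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t
    fold(uint32_t x)
    {
        while (x >> 16) {
            x = (x & 0xffff) + (x >> 16);
        }
        return (uint16_t) x;
    }

    /* RFC 1624: HC' = ~(~HC + ~m + m'), applied to a 32-bit field
     * split into two 16-bit words. */
    static uint16_t
    recalc_csum32_model(uint16_t csum, uint32_t old32, uint32_t new32)
    {
        uint32_t sum = (uint16_t) ~csum;

        sum += (uint16_t) ~(old32 >> 16) + (uint16_t) ~(old32 & 0xffff);
        sum += (new32 >> 16) + (new32 & 0xffff);
        return ~fold(sum);
    }

    int
    main(void)
    {
        /* E.g. rewriting 10.0.0.1 -> 10.0.0.2 in a checksummed header. */
        printf("0x%04x\n",
               recalc_csum32_model(0x1234, 0x0a000001, 0x0a000002));
        return 0;
    }
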
@@ -1375,7 +1389,9 @@ static void
 packet_set_port(ovs_be16 *port, ovs_be16 new_port, ovs_be16 *csum)
 {
     if (*port != new_port) {
-        *csum = recalc_csum16(*csum, *port, new_port);
+        if (csum) {
+            *csum = recalc_csum16(*csum, *port, new_port);
+        }
         *port = new_port;
     }
 }
@@ -1387,9 +1403,16 @@ void
 packet_set_tcp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst)
 {
     struct tcp_header *th = dp_packet_l4(packet);
+    ovs_be16 *csum = NULL;
+
+    if (dp_packet_hwol_l4_is_tcp(packet)) {
+        dp_packet_ol_reset_l4_csum_good(packet);
+    } else {
+        csum = &th->tcp_csum;
+    }
 
-    packet_set_port(&th->tcp_src, src, &th->tcp_csum);
-    packet_set_port(&th->tcp_dst, dst, &th->tcp_csum);
+    packet_set_port(&th->tcp_src, src, csum);
+    packet_set_port(&th->tcp_dst, dst, csum);
 
     pkt_metadata_init_conn(&packet->md);
 }
@@ -1401,17 +1424,21 @@ packet_set_udp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst)
 {
     struct udp_header *uh = dp_packet_l4(packet);
 
-    if (uh->udp_csum) {
-        packet_set_port(&uh->udp_src, src, &uh->udp_csum);
-        packet_set_port(&uh->udp_dst, dst, &uh->udp_csum);
+    if (dp_packet_hwol_l4_is_udp(packet)) {
+        dp_packet_ol_reset_l4_csum_good(packet);
+        packet_set_port(&uh->udp_src, src, NULL);
+        packet_set_port(&uh->udp_dst, dst, NULL);
+    } else {
+        ovs_be16 *csum = uh->udp_csum ? &uh->udp_csum : NULL;
+
+        packet_set_port(&uh->udp_src, src, csum);
+        packet_set_port(&uh->udp_dst, dst, csum);
 
-        if (!uh->udp_csum) {
+        if (csum && !uh->udp_csum) {
             uh->udp_csum = htons(0xffff);
         }
-    } else {
-        uh->udp_src = src;
-        uh->udp_dst = dst;
     }
+
     pkt_metadata_init_conn(&packet->md);
 }
 
@@ -1422,18 +1449,27 @@ void
 packet_set_sctp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst)
 {
     struct sctp_header *sh = dp_packet_l4(packet);
-    ovs_be32 old_csum, old_correct_csum, new_csum;
-    uint16_t tp_len = dp_packet_l4_size(packet);
 
-    old_csum = get_16aligned_be32(&sh->sctp_csum);
-    put_16aligned_be32(&sh->sctp_csum, 0);
-    old_correct_csum = crc32c((void *)sh, tp_len);
+    if (dp_packet_hwol_l4_is_sctp(packet)) {
+        dp_packet_ol_reset_l4_csum_good(packet);
+        sh->sctp_src = src;
+        sh->sctp_dst = dst;
+    } else {
+        ovs_be32 old_csum, old_correct_csum, new_csum;
+        uint16_t tp_len = dp_packet_l4_size(packet);
 
-    sh->sctp_src = src;
-    sh->sctp_dst = dst;
+        old_csum = get_16aligned_be32(&sh->sctp_csum);
+        put_16aligned_be32(&sh->sctp_csum, 0);
+        old_correct_csum = crc32c((void *) sh, tp_len);
+
+        sh->sctp_src = src;
+        sh->sctp_dst = dst;
+
+        new_csum = crc32c((void *) sh, tp_len);
+        put_16aligned_be32(&sh->sctp_csum, old_csum ^ old_correct_csum
+                           ^ new_csum);
+    }
 
-    new_csum = crc32c((void *)sh, tp_len);
-    put_16aligned_be32(&sh->sctp_csum, old_csum ^ old_correct_csum ^ new_csum);
     pkt_metadata_init_conn(&packet->md);
 }
@@ -1957,3 +1993,72 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6)
         }
     }
 }
+
+/* Set TCP checksum field in packet 'p' with complete checksum.
+ * The packet must have the L3 and L4 offsets. */
+void
+packet_tcp_complete_csum(struct dp_packet *p)
+{
+    struct tcp_header *tcp = dp_packet_l4(p);
+
+    tcp->tcp_csum = 0;
+    if (dp_packet_hwol_is_ipv4(p)) {
+        struct ip_header *ip = dp_packet_l3(p);
+
+        tcp->tcp_csum = csum_finish(csum_continue(packet_csum_pseudoheader(ip),
+                                                  tcp, dp_packet_l4_size(p)));
+    } else if (dp_packet_hwol_tx_ipv6(p)) {
+        struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p);
+
+        tcp->tcp_csum = packet_csum_upperlayer6(ip6, tcp, ip6->ip6_nxt,
+                                                dp_packet_l4_size(p));
+    } else {
+        OVS_NOT_REACHED();
+    }
+}
+
+/* Set UDP checksum field in packet 'p' with complete checksum.
+ * The packet must have the L3 and L4 offsets. */
+void
+packet_udp_complete_csum(struct dp_packet *p)
+{
+    struct udp_header *udp = dp_packet_l4(p);
+
+    /* Skip csum calculation if the udp_csum is zero. */
+    if (!udp->udp_csum) {
+        return;
+    }
+
+    udp->udp_csum = 0;
+    if (dp_packet_hwol_is_ipv4(p)) {
+        struct ip_header *ip = dp_packet_l3(p);
+
+        udp->udp_csum = csum_finish(csum_continue(packet_csum_pseudoheader(ip),
+                                                  udp, dp_packet_l4_size(p)));
+    } else if (dp_packet_hwol_tx_ipv6(p)) {
+        struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p);
+
+        udp->udp_csum = packet_csum_upperlayer6(ip6, udp, ip6->ip6_nxt,
+                                                dp_packet_l4_size(p));
+    } else {
+        OVS_NOT_REACHED();
+    }
+
+    if (!udp->udp_csum) {
+        udp->udp_csum = htons(0xffff);
+    }
+}
+
+/* Set SCTP checksum field in packet 'p' with complete checksum.
+ * The packet must have the L3 and L4 offsets. */
+void
+packet_sctp_complete_csum(struct dp_packet *p)
+{
+    struct sctp_header *sh = dp_packet_l4(p);
+    uint16_t tp_len = dp_packet_l4_size(p);
+    ovs_be32 csum;
+
+    put_16aligned_be32(&sh->sctp_csum, 0);
+    csum = crc32c((void *) sh, tp_len);
+    put_16aligned_be32(&sh->sctp_csum, csum);
+}
diff --git a/lib/packets.h b/lib/packets.h
index ac4c28e471e..200b25cf012 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -1671,6 +1671,9 @@ uint32_t packet_csum_pseudoheader(const struct ip_header *);
 bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr,
                        bool *first_frag);
 void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
+void packet_tcp_complete_csum(struct dp_packet *);
+void packet_udp_complete_csum(struct dp_packet *);
+void packet_sctp_complete_csum(struct dp_packet *);
 
 #define DNS_HEADER_LEN 12
 struct dns_header {
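
The packet_*_complete_csum() helpers build a full checksum from the pseudo
header and the L4 header plus payload using csum_continue()/csum_finish().
A self-contained model of that accumulate-then-fold pattern (illustrative
only, not the OVS implementation):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Accumulate 16-bit words into a 32-bit partial sum... */
    static uint32_t
    csum_continue_model(uint32_t partial, const uint16_t *data,
                        size_t n_words)
    {
        for (size_t i = 0; i < n_words; i++) {
            partial += data[i];
        }
        return partial;
    }

    /* ...then fold the carries and invert once at the end. */
    static uint16_t
    csum_finish_model(uint32_t partial)
    {
        while (partial >> 16) {
            partial = (partial & 0xffff) + (partial >> 16);
        }
        return (uint16_t) ~partial;
    }

    int
    main(void)
    {
        /* The pseudo-header sum would seed 'partial' in the real code. */
        uint16_t seg[2] = { 0x4500, 0x0073 };
        uint32_t partial = csum_continue_model(0, seg, 2);

        printf("checksum: 0x%04x\n", csum_finish_model(partial));
        return 0;
    }
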