diff --git a/lib/automake.mk b/lib/automake.mk index 1be13a420a7..0dc8a35cc43 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/dpctl.h \ lib/dp-packet.h \ lib/dp-packet.c \ + lib/dp-packet-gso.c \ + lib/dp-packet-gso.h \ lib/dpdk.h \ lib/dpif-netdev-extract-study.c \ lib/dpif-netdev-lookup.h \ diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c new file mode 100644 index 00000000000..e2c141b32a7 --- /dev/null +++ b/lib/dp-packet-gso.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "dp-packet.h" +#include "dp-packet-gso.h" +#include "netdev-provider.h" +#include "openvswitch/vlog.h" + +VLOG_DEFINE_THIS_MODULE(dp_packet_gso); + +/* Retuns a new packet that is a segment of packet 'p'. + * + * The new packet is initialized with 'hdr_len' bytes from the + * start of packet 'p' and then appended with 'data_len' bytes + * from the 'data' buffer. + * + * Note: The packet headers are not updated. */ +static struct dp_packet * +dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len, + const char *data, size_t data_len) +{ + struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len, + dp_packet_headroom(p)); + + /* Append the original packet headers and then the payload. */ + dp_packet_put(seg, dp_packet_data(p), hdr_len); + dp_packet_put(seg, data, data_len); + + /* The new segment should have the same offsets. */ + seg->l2_5_ofs = p->l2_5_ofs; + seg->l3_ofs = p->l3_ofs; + seg->l4_ofs = p->l4_ofs; + + /* The protocol headers remain the same, so preserve hash and mark. */ + *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p); + *dp_packet_flow_mark_ptr(seg) = *dp_packet_flow_mark_ptr(p); + + /* The segment should inherit all the offloading flags from the + * original packet, except for the TCP segmentation, external + * buffer and indirect buffer flags. */ + *dp_packet_ol_flags_ptr(seg) = *dp_packet_ol_flags_ptr(p) + & DP_PACKET_OL_SUPPORTED_MASK; + + dp_packet_hwol_reset_tcp_seg(seg); + + return seg; +} + +/* Returns the calculated number of TCP segments in packet 'p'. */ +int +dp_packet_gso_nr_segs(struct dp_packet *p) +{ + uint16_t segsz = dp_packet_get_tso_segsz(p); + const char *data_tail; + const char *data_pos; + + data_pos = dp_packet_get_tcp_payload(p); + data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p); + + return DIV_ROUND_UP(data_tail - data_pos, segsz); +} + +/* Perform software segmentation on packet 'p'. + * + * Segments packet 'p' into the array of preallocated batches in 'batches', + * updating the 'batches' pointer as needed and returns true. + * + * Returns false if the packet cannot be segmented. */ +bool +dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + struct dp_packet_batch *curr_batch = *batches; + struct tcp_header *tcp_hdr; + struct ip_header *ip_hdr; + struct dp_packet *seg; + uint16_t tcp_offset; + uint16_t tso_segsz; + uint32_t tcp_seq; + uint16_t ip_id; + int hdr_len; + int seg_len; + + tso_segsz = dp_packet_get_tso_segsz(p); + if (!tso_segsz) { + VLOG_WARN_RL(&rl, "GSO packet with len %d with no segment size.", + dp_packet_size(p)); + return false; + } + + tcp_hdr = dp_packet_l4(p); + tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); + tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); + hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p)) + + tcp_offset * 4; + ip_id = 0; + if (dp_packet_hwol_is_ipv4(p)) { + ip_hdr = dp_packet_l3(p); + ip_id = ntohs(ip_hdr->ip_id); + } + + const char *data_tail = (char *) dp_packet_tail(p) + - dp_packet_l2_pad_size(p); + const char *data_pos = dp_packet_get_tcp_payload(p); + int n_segs = dp_packet_gso_nr_segs(p); + + for (int i = 0; i < n_segs; i++) { + seg_len = data_tail - data_pos; + if (seg_len > tso_segsz) { + seg_len = tso_segsz; + } + + seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len); + data_pos += seg_len; + + /* Update L3 header. */ + if (dp_packet_hwol_is_ipv4(seg)) { + ip_hdr = dp_packet_l3(seg); + ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + + dp_packet_l4_size(seg)); + ip_hdr->ip_id = htons(ip_id); + ip_hdr->ip_csum = 0; + ip_id++; + } else { + struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg); + + ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen + = htons(sizeof *ip_hdr + dp_packet_l4_size(seg)); + } + + /* Update L4 header. */ + tcp_hdr = dp_packet_l4(seg); + put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq)); + tcp_seq += seg_len; + if (OVS_LIKELY(i < (n_segs - 1))) { + /* Reset flags PUSH and FIN unless it is the last segment. */ + uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl) + & ~(TCP_PSH | TCP_FIN); + tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset); + } + + if (dp_packet_batch_is_full(curr_batch)) { + curr_batch++; + } + + dp_packet_batch_add(curr_batch, seg); + } + + *batches = curr_batch; + return true; +} diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h new file mode 100644 index 00000000000..9c282fb86cc --- /dev/null +++ b/lib/dp-packet-gso.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DP_PACKET_GSO_H +#define DP_PACKET_GSO_H 1 + +bool dp_packet_gso(struct dp_packet *, struct dp_packet_batch **); +int dp_packet_gso_nr_segs(struct dp_packet *); + +#endif /* dp-packet-gso.h */ diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 30a1d9dc090..11aa0072354 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -1131,6 +1131,13 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; } +/* Resets TCP Segmentation flag in packet 'p'. */ +static inline void +dp_packet_hwol_reset_tcp_seg(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_TCP_SEG; +} + /* Returns 'true' if the IP header has good integrity and the * checksum in it is complete. */ static inline bool diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 706036d4e42..1ff25c24692 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2471,6 +2471,7 @@ static bool netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) { struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); + struct tcp_header *th; if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK | RTE_MBUF_F_TX_TCP_SEG))) { @@ -2483,27 +2484,36 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) mbuf->l4_len = 0; mbuf->outer_l2_len = 0; mbuf->outer_l3_len = 0; + th = dp_packet_l4(pkt); if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { - struct tcp_header *th = dp_packet_l4(pkt); - int hdr_len; - if (!th) { VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header" " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); return false; } + } + + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) { + if (!th) { + VLOG_WARN_RL(&rl, "%s: TCP offloading without L4 header" + " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); + return false; + } mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; - mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM; - hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; - if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) { - VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. " - "hdr: %"PRIu32", gso: %"PRIu32", max len: %"PRIu32"", - dev->up.name, hdr_len, mbuf->tso_segsz, - dev->max_packet_len); - return false; + + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { + int hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; + if (OVS_UNLIKELY((hdr_len + + mbuf->tso_segsz) > dev->max_packet_len)) { + VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. hdr: %"PRIu32", " + "gso: %"PRIu32", max len: %"PRIu32"", + dev->up.name, hdr_len, mbuf->tso_segsz, + dev->max_packet_len); + return false; + } } if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) { @@ -2891,6 +2901,7 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig) mbuf_dest->packet_type = pkt_orig->mbuf.packet_type; mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags & ~(RTE_MBUF_F_EXTERNAL | RTE_MBUF_F_INDIRECT)); + mbuf_dest->tso_segsz = pkt_orig->mbuf.tso_segsz; memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size, sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size)); @@ -2949,11 +2960,20 @@ netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch, struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets; struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); size_t cnt, pkt_cnt = dp_packet_batch_size(batch); + struct dp_packet *packet; + bool need_copy = false; memset(stats, 0, sizeof *stats); + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (packet->source != DPBUF_DPDK) { + need_copy = true; + break; + } + } + /* Copy dp-packets to mbufs. */ - if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) { + if (OVS_UNLIKELY(need_copy)) { cnt = dpdk_copy_batch_to_mbuf(netdev, batch); stats->tx_failure_drops += pkt_cnt - cnt; pkt_cnt = cnt; diff --git a/lib/netdev.c b/lib/netdev.c index e5ac7713d2e..3ed8049f76a 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -35,6 +35,7 @@ #include "coverage.h" #include "dpif.h" #include "dp-packet.h" +#include "dp-packet-gso.h" #include "openvswitch/dynamic-string.h" #include "fatal-signal.h" #include "hash.h" @@ -56,6 +57,7 @@ #include "svec.h" #include "openvswitch/vlog.h" #include "flow.h" +#include "userspace-tso.h" #include "util.h" #ifdef __linux__ #include "tc.h" @@ -67,8 +69,9 @@ COVERAGE_DEFINE(netdev_received); COVERAGE_DEFINE(netdev_sent); COVERAGE_DEFINE(netdev_add_router); COVERAGE_DEFINE(netdev_get_stats); -COVERAGE_DEFINE(netdev_send_prepare_drops); COVERAGE_DEFINE(netdev_push_header_drops); +COVERAGE_DEFINE(netdev_soft_seg_good); +COVERAGE_DEFINE(netdev_soft_seg_drops); struct netdev_saved_flags { struct netdev *netdev; @@ -792,60 +795,84 @@ netdev_get_pt_mode(const struct netdev *netdev) : NETDEV_PT_LEGACY_L2); } -/* Check if a 'packet' is compatible with 'netdev_flags'. - * If a packet is incompatible, return 'false' with the 'errormsg' - * pointing to a reason. */ -static bool -netdev_send_prepare_packet(const uint64_t netdev_flags, - struct dp_packet *packet, char **errormsg) -{ - if (dp_packet_hwol_is_tso(packet) - && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { - /* Fall back to GSO in software. */ - VLOG_ERR_BUF(errormsg, "No TSO support"); - return false; - } - - /* Packet with IP csum offloading enabled was received with verified csum. - * Leave the IP csum offloading enabled even with good checksum to the - * netdev to decide what would be the best to do. - * Provide a software fallback in case the device doesn't support IP csum - * offloading. Note: Encapsulated packet must have the inner IP header - * csum already calculated. - * Packet with L4 csum offloading enabled was received with verified csum. - * Leave the L4 csum offloading enabled even with good checksum for the - * netdev to decide what would be the best to do. - * Netdev that requires pseudo header csum needs to calculate that. - * Provide a software fallback in case the netdev doesn't support L4 csum - * offloading. Note: Encapsulated packet must have the inner L4 header - * csum already calculated. */ - dp_packet_ol_send_prepare(packet, netdev_flags); - - return true; -} - -/* Check if each packet in 'batch' is compatible with 'netdev' features, - * otherwise either fall back to software implementation or drop it. */ -static void -netdev_send_prepare_batch(const struct netdev *netdev, - struct dp_packet_batch *batch) +/* Attempts to segment GSO flagged packets and send them as multiple bundles. + * This function is only used if at least one packet in the current batch is + * flagged for TSO and the netdev does not support this. + * + * The return value is 0 if all batches sent successfully, and an error code + * from netdev_class->send() if at least one batch failed to send. */ +static int +netdev_send_tso(struct netdev *netdev, int qid, + struct dp_packet_batch *batch, bool concurrent_txq) { + struct dp_packet_batch *batches; struct dp_packet *packet; - size_t i, size = dp_packet_batch_size(batch); - - DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) { - char *errormsg = NULL; + int retval = 0; + int n_packets; + int n_batches; + int error; - if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) { - dp_packet_batch_refill(batch, packet, i); + /* Calculate the total number of packets in the batch after + * the segmentation. */ + n_packets = 0; + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + n_packets += dp_packet_gso_nr_segs(packet); } else { + n_packets++; + } + } + + if (!n_packets) { + return 0; + } + + /* Allocate enough batches to store all the packets in order. */ + n_batches = DIV_ROUND_UP(n_packets, NETDEV_MAX_BURST); + batches = xmalloc(n_batches * sizeof *batches); + + struct dp_packet_batch *curr_batch = batches; + struct dp_packet_batch *last_batch = &batches[n_batches - 1]; + for (curr_batch = batches; curr_batch <= last_batch; curr_batch++) { + dp_packet_batch_init(curr_batch); + } + + /* Do the packet segmentation if TSO is flagged. */ + size_t size = dp_packet_batch_size(batch); + size_t k; + curr_batch = batches; + DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + if (dp_packet_gso(packet, &curr_batch)) { + COVERAGE_INC(netdev_soft_seg_good); + } else { + COVERAGE_INC(netdev_soft_seg_drops); + } dp_packet_delete(packet); - COVERAGE_INC(netdev_send_prepare_drops); - VLOG_WARN_RL(&rl, "%s: Packet dropped: %s", - netdev_get_name(netdev), errormsg); - free(errormsg); + } else { + if (dp_packet_batch_is_full(curr_batch)) { + curr_batch++; + } + + dp_packet_batch_add(curr_batch, packet); + } + } + + for (curr_batch = batches; curr_batch <= last_batch; curr_batch++) { + DP_PACKET_BATCH_FOR_EACH (i, packet, curr_batch) { + dp_packet_ol_send_prepare(packet, netdev->ol_flags); + } + + error = netdev->netdev_class->send(netdev, qid, curr_batch, + concurrent_txq); + if (!error) { + COVERAGE_INC(netdev_sent); + } else { + retval = error; } } + free(batches); + return retval; } /* Sends 'batch' on 'netdev'. Returns 0 if successful (for every packet), @@ -877,11 +904,21 @@ int netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, bool concurrent_txq) { + const uint64_t netdev_flags = netdev->ol_flags; + struct dp_packet *packet; int error; - netdev_send_prepare_batch(netdev, batch); - if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) { - return 0; + if (userspace_tso_enabled() && + !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + return netdev_send_tso(netdev, qid, batch, concurrent_txq); + } + } + } + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + dp_packet_ol_send_prepare(packet, netdev_flags); } error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq); diff --git a/lib/packets.c b/lib/packets.c index 462b51f92dc..dab823ba225 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -427,7 +427,7 @@ add_mpls(struct dp_packet *packet, ovs_be16 ethtype, ovs_be32 lse, } if (!l3_encap) { - struct mpls_hdr *header = dp_packet_push_uninit(packet, MPLS_HLEN); + struct mpls_hdr *header = dp_packet_resize_l2(packet, MPLS_HLEN); put_16aligned_be32(&header->mpls_lse, lse); packet->l2_5_ofs = 0; @@ -513,7 +513,7 @@ push_nsh(struct dp_packet *packet, const struct nsh_hdr *nsh_hdr_src) OVS_NOT_REACHED(); } - nsh = (struct nsh_hdr *) dp_packet_push_uninit(packet, length); + nsh = (struct nsh_hdr *) dp_packet_resize_l2(packet, length); memcpy(nsh, nsh_hdr_src, length); nsh->next_proto = next_proto; packet->packet_type = htonl(PT_NSH); diff --git a/tests/system-traffic.at b/tests/system-traffic.at index a7d4ed83bdc..dcafa8b29c7 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1,5 +1,50 @@ AT_BANNER([datapath-sanity]) +AT_SETUP([datapath - netdev offload software fallback]) +AT_SKIP_IF([test $HAVE_NC = no]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Test the case where only one side has all checksum and tso offload disabled. +AT_CHECK([ethtool -K ovs-p1 tso off], [0], [ignore], [ignore]) +AT_CHECK([ethtool -K ovs-p1 sg off], [0], [ignore], [ignore]) + +dnl Reinitialize. +AT_CHECK([ovs-vsctl del-port ovs-p1]) +AT_CHECK([ovs-vsctl add-port br0 ovs-p1]) + +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +NS_CHECK_EXEC([at_ns1], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +NETNS_DAEMONIZE([at_ns0], [nc -l 1234 > data_0], [nc1.pid]) +NETNS_DAEMONIZE([at_ns1], [nc -l 1234 > data_1], [nc2.pid]) + +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +on_exit 'rm -f payload.bin' + +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.2 1234 < payload.bin]) +NS_CHECK_EXEC([at_ns1], [nc $NC_EOF_OPT 10.1.1.1 1234 < payload.bin]) + +dnl Wait until transfer completes. +OVS_WAIT_WHILE([kill -0 $(cat nc1.pid) $(cat nc2.pid)]) + +AT_CHECK([diff -q payload.bin data_0], [0]) +AT_CHECK([diff -q payload.bin data_1], [0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([datapath - ping between two ports]) OVS_TRAFFIC_VSWITCHD_START()