diff --git a/lib/automake.mk b/lib/automake.mk index f5b53223d49..0341859201a 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -772,3 +772,15 @@ lib_dpif_netlink2_la_LDFLAGS = -shared -module \ $(OVS_LTINFO) \ $(AM_LDFLAGS) lib_dpif_netlink2_la_SOURCES = lib/dpif-netlink2.c + +lib_LTLIBRARIES += lib/dpif-netdev2.la +lib_dpif_netdev2_la_LDFLAGS = -shared -module \ + $(OVS_LTINFO) \ + $(AM_LDFLAGS) +lib_dpif_netdev2_la_SOURCES = \ + lib/dpif-netdev2.c \ + lib/dpif-netdev2.h \ + lib/dpif-netdev-ext2.c \ + lib/dpif-netdev-ext2.h \ + lib/netdev-offload-ext.c \ + lib/netdev-offload-ext.h diff --git a/lib/dpif-netdev-ext2.c b/lib/dpif-netdev-ext2.c new file mode 100644 index 00000000000..84e4241b3c1 --- /dev/null +++ b/lib/dpif-netdev-ext2.c @@ -0,0 +1,4393 @@ +/* + * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "dpif-netdev2.h" +#include "dpif-netdev-private.h" +#include "dpif-netdev-private-dfc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "batch.h" +#include "bitmap.h" +#include "ccmap.h" +#include "cmap.h" +#include "conntrack.h" +#include "conntrack-offload.h" +#include "conntrack-tp.h" +#include "coverage.h" +#include "ct-dpif.h" +#include "csum.h" +#include "dp-packet.h" +#include "dpif.h" +#include "dpif-netdev-lookup.h" +#include "dpif-netdev-perf.h" +#include "dpif-netdev-private-extract.h" +#include "dpif-provider.h" +#include "dummy.h" +#include "fat-rwlock.h" +#include "flow.h" +#include "histogram.h" +#include "hmapx.h" +#include "id-pool.h" +#include "id-fpool.h" +#include "ipf.h" +#include "metrics.h" +#include "mov-avg.h" +#include "mpsc-queue.h" +#include "netdev.h" +#include "netdev-offload.h" +#include "netdev-offload-dpdk-ext.h" +#include "netdev-offload-ext.h" +#include "netdev-provider.h" +#include "netdev-vport.h" +#include "netdev-dpdk.h" +#include "netlink.h" +#include "odp-execute.h" +#include "odp-util.h" +#include "openvswitch/dynamic-string.h" +#include "openvswitch/list.h" +#include "openvswitch/match.h" +#include "openvswitch/ofp-parse.h" +#include "openvswitch/ofp-print.h" +#include "openvswitch/ofpbuf.h" +#include "openvswitch/shash.h" +#include "openvswitch/vlog.h" +#include "ovs-doca.h" +#include "ovs-numa.h" +#include "ovs-rcu.h" +#include "packets.h" +#include "openvswitch/poll-loop.h" +#include "pvector.h" +#include "random.h" +#include "seq.h" +#include "smap.h" +#include "sset.h" +#include "timeval.h" +#include "tnl-neigh-cache.h" +#include "tnl-ports.h" +#include "unixctl.h" +#include "util.h" +#include "uuid.h" + +VLOG_DEFINE_THIS_MODULE(dpif_netdev_ext); + +extern struct odp_support dp_netdev_support; + +static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600); + +static 
atomic_bool dump_packets_enabled = false; +static unsigned int n_pmd_threads; + +bool dp_netdev_e2e_cache_enabled = false; +static uint32_t dp_netdev_e2e_cache_size = 0; +#define E2E_CACHE_MAX_TRACE_Q_SIZE (10000u) +static uint32_t dp_netdev_e2e_cache_trace_q_size = E2E_CACHE_MAX_TRACE_Q_SIZE; +#define INVALID_OFFLOAD_THREAD_NB (MAX_OFFLOAD_THREAD_NB + 1) + +static struct ovs_mutex flows_map_mutex = OVS_MUTEX_INITIALIZER; +static struct hmap flows_map OVS_GUARDED_BY(flows_map_mutex) = + HMAP_INITIALIZER(&flows_map); +static atomic_count flows_map_count = ATOMIC_COUNT_INIT(0); +static struct ovs_mutex merged_flows_map_mutex = OVS_MUTEX_INITIALIZER; +static struct hmap merged_flows_map OVS_GUARDED_BY(merged_flows_map_mutex) = + HMAP_INITIALIZER(&merged_flows_map); +static atomic_count merged_flows_map_count = ATOMIC_COUNT_INIT(0); + +COVERAGE_DEFINE(flow_offload_200ms_latency); +COVERAGE_DEFINE(ct_offload_30us_latency); +COVERAGE_DEFINE(ct_offload_50us_latency); +COVERAGE_DEFINE(ct_offload_100us_latency); + +#define DEFAULT_MAX_RECIRC_DEPTH 8 +unsigned int max_recirc_depth = DEFAULT_MAX_RECIRC_DEPTH; + +struct ovs_numa_dump * +dp_netdev_pmd_cmask2cores(const char *pmd_cmask) +{ + struct ovs_numa_dump *pmd_cores; + + if (pmd_cmask && pmd_cmask[0]) { + pmd_cores = ovs_numa_dump_cores_with_cmask(pmd_cmask); + } else { + pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS); + } + + return pmd_cores; +} + +unsigned int +dpif_netdev2_get_n_pmd_threads(void) +{ + return n_pmd_threads; +} + +void +dpif_netdev2_set_n_pmd_threads(const char *pmd_cmask) +{ + struct ovs_numa_dump *pmd_cores; + + pmd_cores = dp_netdev_pmd_cmask2cores(pmd_cmask); + n_pmd_threads = ovs_numa_dump_count(pmd_cores); + ovs_numa_dump_destroy(pmd_cores); +} + +void +dp_netdev_dump_packets_toggle(struct unixctl_conn *conn, int argc, + const char *argv[], void *aux OVS_UNUSED) +{ + bool flag = false; + + if (argc == 1) { + flag = true; + } else { + if (!strcmp(argv[1], "on")) { + flag = true; + } 
else if (!strcmp(argv[1], "off")) { + flag = false; + } else { + unixctl_command_reply_error(conn, "Invalid parameters"); + return; + } + } + + atomic_store_relaxed(&dump_packets_enabled, flag); + unixctl_command_reply(conn, flag ? "ON" : "OFF"); +} + +void +dp_netdev_read_dump_packets_enabled(bool *flag) +{ + atomic_read_relaxed(&dump_packets_enabled, flag); +} + +struct dp_offload_thread * +dp_netdev_offload_thread_next(struct dp_offload_thread *start, + unsigned int *tid, bool include_main) +{ + size_t array_size = ARRAY_SIZE(dp_offload_threads); + struct dp_offload_thread *next; + bool active; + + if (start == NULL) { + unsigned int first = NETDEV_OFFLOAD_THREAD_MAIN; + + if (!include_main) { + first += 1; + } + next = &dp_offload_threads[first]; + } else { + next = start + 1; + } + + while (next != &dp_offload_threads[array_size]) { + atomic_read(&next->active, &active); + if (active) { + if (tid) { + *tid = (unsigned int) (ptrdiff_t) (next - dp_offload_threads); + } + return next; + } + next++; + } + + return NULL; +} + +void +dp_netdev_offload_thread_enqueue(struct dp_offload_thread *thread, + struct dp_offload_thread_item *offload) +{ + dp_netdev_offload_init(); + + mpsc_queue_insert(&thread->offload_queue, &offload->node); + atomic_count_inc64(&thread->enqueued_offload); +} + +#define CT_ADD_DEFAULT_QUEUE_SIZE 200000 +static unsigned int offload_ct_add_queue_size = CT_ADD_DEFAULT_QUEUE_SIZE; + +void +packet_enqueue_to_flow_map(struct dp_packet *packet, + struct dp_netdev_flow *flow, + uint16_t tcp_flags, + struct dp_packet_flow_map *flow_map, + size_t index) +{ + struct dp_packet_flow_map *map = &flow_map[index]; + map->flow = flow; + map->packet = packet; + map->tcp_flags = tcp_flags; +} + +static struct hmap counter_map = HMAP_INITIALIZER(&counter_map); + +static inline int +e2e_cache_counter_cmp_key(const struct e2e_cache_counter_item *item, + const struct flows_counter_key *key) +{ + if (item->is_ct) { + /* In case of CT compare only first 128 bits 
/* Copies one field from a merged match into a 'struct match'-shaped
 * destination: 'src->spec.field' is copied to 'dst->flow.field' and
 * 'src->mask.field' is copied to 'dst->wc.masks.field'.  Each copy is sized
 * by the destination member.
 *
 * 'field' is a member designator (for example tunnel.ip_dst or vlans[0].tci),
 * so it cannot be parenthesized.  'dst' and 'src' are pointer expressions;
 * they are evaluated more than once and must be free of side effects.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement and remains correct under an unbraced if/else.  The caller
 * supplies the terminating semicolon. */
#define merged_match_to_match_field(dst, src, field)                    \
    do {                                                                \
        memcpy(&(dst)->flow.field, &(src)->spec.field,                  \
               sizeof (dst)->flow.field);                               \
        memcpy(&(dst)->wc.masks.field, &(src)->mask.field,              \
               sizeof (dst)->wc.masks.field);                           \
    } while (0)
merged_match, tunnel.tp_dst); + + merged_match_to_match_field(match, merged_match, dl_dst); + merged_match_to_match_field(match, merged_match, dl_src); + merged_match_to_match_field(match, merged_match, dl_type); + + merged_match_to_match_field(match, merged_match, vlans[0].tci); + + merged_match_to_match_field(match, merged_match, nw_src); + merged_match_to_match_field(match, merged_match, nw_dst); + merged_match_to_match_field(match, merged_match, ipv6_src); + merged_match_to_match_field(match, merged_match, ipv6_dst); + merged_match_to_match_field(match, merged_match, nw_frag); + merged_match_to_match_field(match, merged_match, nw_proto); + + merged_match_to_match_field(match, merged_match, tp_src); + merged_match_to_match_field(match, merged_match, tp_dst); + + merged_match_to_match_field(match, merged_match, ct_zone); +} + +void +dpif_netdev_dump_e2e_flows(struct hmap *portno_names, + struct ofputil_port_map *port_map, struct ds *s) +{ + struct e2e_cache_merged_flow *merged_flow; + struct dp_netdev_flow netdev_flow; + struct match match; + + memset(&netdev_flow, 0, sizeof netdev_flow); + + ovs_mutex_lock(&merged_flows_map_mutex); + + HMAP_FOR_EACH (merged_flow, node.in_hmap, &merged_flows_map) { + merged_match_to_match(&match, &merged_flow->merged_match); + odp_format_ufid(&merged_flow->ufid, s); + ds_put_cstr(s, ", "); + match_format(&match, port_map, s, OFP_DEFAULT_PRIORITY); + *CONST_CAST(ovs_u128 *, &netdev_flow.mega_ufid) = merged_flow->ufid; + CONST_CAST(struct flow *, &netdev_flow.flow)->in_port = + match.flow.in_port; + ds_put_cstr(s, ", actions:"); + format_odp_actions(s, merged_flow->actions, merged_flow->actions_size, + portno_names); + ds_put_cstr(s, "\n"); + } + + ovs_mutex_unlock(&merged_flows_map_mutex); +} + +void +e2e_cache_trace_add_flow(struct dp_packet *p, + const ovs_u128 *ufid) +{ + uint32_t e2e_trace_size = p->e2e_trace_size; + + if (OVS_UNLIKELY(e2e_trace_size >= E2E_CACHE_MAX_TRACE)) { + p->e2e_trace_flags |= 
E2E_CACHE_TRACE_FLAG_OVERFLOW; + return; + } + p->e2e_trace[e2e_trace_size] = *ufid; + p->e2e_trace_size = e2e_trace_size + 1; +} + +static inline void +e2e_cache_trace_msg_enqueue(struct e2e_cache_trace_message *msg, + unsigned int tid) +{ + struct e2e_cache_stats *e2e_stats = &dp_offload_threads[tid].e2e_stats; + + mpsc_queue_insert(&dp_offload_threads[tid].trace_queue, &msg->node); + atomic_count_inc(&e2e_stats->queue_trcs); +} + +/* Associate the merged flow to each of its composing flows, + * to allow accessing: + * - From the merged flow to all its composing flows. + * - From each flow to all the merged flow it is part of. + */ +static void +e2e_cache_associate_merged_flow(struct e2e_cache_merged_flow *merged_flow, + struct e2e_cache_ovs_flow *flows[], + uint16_t num_flows) +{ + uint16_t i, j; + + ovs_mutex_lock(&flows_map_mutex); + + for (j = 0, i = 0; j < num_flows; j++) { + if (flows[j]->offload_state != E2E_OL_STATE_FLOW && j > 0 && + flows[j - 1]->offload_state != E2E_OL_STATE_FLOW) { + continue; + } + merged_flow->associated_flows[i].index = i; + ovs_list_push_back(&flows[j]->associated_merged_flows, + &merged_flow->associated_flows[i].list); + merged_flow->associated_flows[i].mt_flow = flows[j]; + i++; + } + merged_flow->associated_flows_len = i; + + ovs_mutex_unlock(&flows_map_mutex); +} + +static void +e2e_cache_disassociate_merged_flow(struct e2e_cache_merged_flow *merged_flow) +{ + uint16_t i, num_flows = merged_flow->associated_flows_len; + + for (i = 0; i < num_flows; i++) { + ovs_list_remove(&merged_flow->associated_flows[i].list); + } +} + +/* Find e2e_cache_merged_flow with @ufid. + * merged_flows_map_mutex mutex must be locked. 
+ */ +static inline struct e2e_cache_merged_flow * +e2e_cache_merged_flow_find(const ovs_u128 *ufid, uint32_t hash) +{ + struct e2e_cache_merged_flow *merged_flow; + + HMAP_FOR_EACH_WITH_HASH (merged_flow, node.in_hmap, hash, + &merged_flows_map) { + if (ovs_u128_equals(*ufid, merged_flow->ufid)) { + return merged_flow; + } + } + + return NULL; +} + +static inline struct e2e_cache_ovs_flow * +e2e_cache_flow_alloc(bool is_ct) +{ + struct e2e_cache_ovs_flow *flow; + size_t alloc_bytes; + + alloc_bytes = sizeof *flow; + alloc_bytes += is_ct ? sizeof flow->ct_match[0] : sizeof flow->match[0]; + + flow = (struct e2e_cache_ovs_flow *) xzalloc(alloc_bytes); + return flow; +} + +static void +e2e_cache_flow_free(void *arg) +{ + struct e2e_cache_ovs_flow *flow = (struct e2e_cache_ovs_flow *) arg; + + if (flow->actions) { + free(flow->actions); + } + free(flow); +} + +static inline struct e2e_cache_ufid_msg * +e2e_cache_ufid_msg_alloc(int op, bool is_ct, size_t actions_len, + long long int now) +{ + struct e2e_cache_ufid_msg *msg; + struct nlattr *actions = NULL; + size_t alloc_size; + + alloc_size = sizeof *msg; + if (op == E2E_UFID_MSG_PUT) { + if (actions_len) { + actions = (struct nlattr *) xmalloc(actions_len); + if (OVS_UNLIKELY(!actions)) { + return NULL; + } + } + alloc_size += is_ct ? 
sizeof msg->ct_match[0] : + sizeof msg->match[0]; + } + + msg = (struct e2e_cache_ufid_msg *) xmalloc(alloc_size); + if (OVS_UNLIKELY(!msg)) { + goto err; + } + + msg->op = op; + msg->is_ct = is_ct; + msg->actions = actions; + msg->actions_len = actions_len; + msg->timestamp = now; + return msg; + +err: + if (actions) { + free(actions); + } + return NULL; +} + +static inline void +e2e_cache_ufid_msg_free(struct e2e_cache_ufid_msg *msg) +{ + if (msg->actions) { + free(msg->actions); + } + free(msg); +} + +static void +e2e_cache_disassociate_counters(struct e2e_cache_merged_flow *merged_flow); + +static void +e2e_cache_merged_flow_free(struct e2e_cache_merged_flow *merged_flow) +{ + e2e_cache_disassociate_counters(merged_flow); + if (merged_flow->actions) { + free(merged_flow->actions); + } + free(merged_flow); +} + +static void +e2e_cache_merged_flow_db_rem(struct e2e_cache_merged_flow *merged_flow) +{ + e2e_cache_disassociate_merged_flow(merged_flow); + + ovs_mutex_lock(&merged_flows_map_mutex); + hmap_remove(&merged_flows_map, &merged_flow->node.in_hmap); + ovs_mutex_unlock(&merged_flows_map_mutex); + atomic_count_dec(&merged_flows_map_count); +} + +static void +e2e_cache_merged_flow_db_del(struct e2e_cache_merged_flow *merged_flow) +{ + /* Lock/unlock to prevent race condition with + * e2e_cache_get_merged_flows_stats() + */ + ovs_mutex_lock(&flows_map_mutex); + e2e_cache_merged_flow_db_rem(merged_flow); + ovs_mutex_unlock(&flows_map_mutex); + + e2e_cache_merged_flow_free(merged_flow); +} + +static int +e2e_cache_merged_flow_offload_del(struct e2e_cache_merged_flow *merged_flow); + +static inline int +e2e_cache_merged_flow_db_put(struct e2e_cache_merged_flow *merged_flow) +{ + uint32_t hash = + hash_bytes(&merged_flow->ufid, sizeof merged_flow->ufid, 0); + struct e2e_cache_merged_flow *old_merged_flow; + + ovs_mutex_lock(&merged_flows_map_mutex); + + old_merged_flow = e2e_cache_merged_flow_find(&merged_flow->ufid, hash); + /* In case the merged flow exists do 
nothing. */ + if (old_merged_flow) { + uint16_t actions_size = merged_flow->actions_size; + + if (old_merged_flow->actions_size == actions_size && + !memcmp(old_merged_flow->actions, merged_flow->actions, + actions_size)) { + ovs_mutex_unlock(&merged_flows_map_mutex); + return -1; + } + + /* Must unlock merged_flows_map_mutex before calling next functions */ + ovs_mutex_unlock(&merged_flows_map_mutex); + + /* In case it's a flow modification delete the current flow + * before inserting the updated one. + */ + e2e_cache_merged_flow_offload_del(old_merged_flow); + e2e_cache_merged_flow_db_del(old_merged_flow); + + ovs_mutex_lock(&merged_flows_map_mutex); + } + + hmap_insert(&merged_flows_map, &merged_flow->node.in_hmap, hash); + + ovs_mutex_unlock(&merged_flows_map_mutex); + atomic_count_inc(&merged_flows_map_count); + return 0; +} + +/* Find e2e_cache_ovs_flow with @ufid and calculated @hash */ +static inline struct e2e_cache_ovs_flow * +e2e_cache_flow_find(const ovs_u128 *ufid, uint32_t hash) + OVS_REQUIRES(flows_map_mutex) +{ + struct e2e_cache_ovs_flow *flow; + + HMAP_FOR_EACH_WITH_HASH (flow, node, hash, &flows_map) { + if (ovs_u128_equals(*ufid, flow->ufid)) { + return flow; + } + } + + return NULL; +} + +static void +e2e_cache_update_ct_stats(struct e2e_cache_ovs_flow *mt_flow, int op) +{ + unsigned int tid = netdev_offload_thread_id(); + struct dp_offload_thread *ofl_thread; + struct e2e_cache_ovs_flow *ct_peer; + + ofl_thread = &dp_offload_threads[tid]; + + ct_peer = mt_flow->ct_peer; + if (op == DP_NETDEV_FLOW_OFFLOAD_OP_ADD) { + if (ct_peer && + (ct_peer->offload_state == E2E_OL_STATE_CT_HW || + ct_peer->offload_state == E2E_OL_STATE_CT_MT || + ct_peer->offload_state == E2E_OL_STATE_CT2CT)) { + atomic_count_inc64(&ofl_thread->ct_bi_dir_connections); + atomic_count_dec64(&ofl_thread->ct_uni_dir_connections); + } else { + atomic_count_inc64(&ofl_thread->ct_uni_dir_connections); + } + } else if (op == DP_NETDEV_FLOW_OFFLOAD_OP_DEL) { + if (ct_peer && + 
(ct_peer->offload_state == E2E_OL_STATE_CT_HW || + ct_peer->offload_state == E2E_OL_STATE_CT_MT || + ct_peer->offload_state == E2E_OL_STATE_CT2CT)) { + atomic_count_dec64(&ofl_thread->ct_bi_dir_connections); + atomic_count_inc64(&ofl_thread->ct_uni_dir_connections); + } else { + atomic_count_dec64(&ofl_thread->ct_uni_dir_connections); + } + } else { + OVS_NOT_REACHED(); + } +} + +static void +e2e_cache_del_associated_merged_flows(struct e2e_cache_ovs_flow *flow, + struct ovs_list *merged_flows_to_delete) +{ + struct flow2flow_item *associated_flow_item, *next_item; + unsigned int tid = netdev_offload_thread_id(); + struct e2e_cache_merged_flow *merged_flow; + + LIST_FOR_EACH_SAFE (associated_flow_item, next_item, list, + &flow->associated_merged_flows) { + merged_flow = + CONTAINER_OF(associated_flow_item, + struct e2e_cache_merged_flow, + associated_flows[associated_flow_item->index]); + if (merged_flow->tid != tid) { + continue; + } + + e2e_cache_merged_flow_db_rem(merged_flow); + ovs_list_push_back(merged_flows_to_delete, &merged_flow->node.in_list); + } +} + +static void +e2e_cache_del_merged_flows(struct ovs_list *merged_flows_to_delete) +{ + struct e2e_cache_merged_flow *merged_flow; + struct ovs_list *l; + + while (!ovs_list_is_empty(merged_flows_to_delete)) { + l = ovs_list_pop_front(merged_flows_to_delete); + + merged_flow = + CONTAINER_OF(l, struct e2e_cache_merged_flow, node.in_list); + + e2e_cache_merged_flow_offload_del(merged_flow); + e2e_cache_merged_flow_free(merged_flow); + } +} + +static struct e2e_cache_ovs_flow * +e2e_cache_flow_db_del_protected(const ovs_u128 *ufid, uint32_t hash, + struct ovs_list *merged_flows_to_delete, + struct ovs_refcount *del_refcnt) + OVS_REQUIRES(flows_map_mutex) +{ + struct e2e_cache_ovs_flow *flow; + + flow = e2e_cache_flow_find(ufid, hash); + if (OVS_UNLIKELY(!flow)) { + return NULL; + } + e2e_cache_del_associated_merged_flows(flow, merged_flows_to_delete); + if (flow->offload_state == E2E_OL_STATE_FLOW) { + if 
(del_refcnt && ovs_refcount_unref(del_refcnt) > 1) { + return NULL; + } + hmap_remove(&flows_map, &flow->node); + atomic_count_dec(&flows_map_count); + ovsrcu_postpone(e2e_cache_flow_free, flow); + flow = NULL; + ovsrcu_postpone(free, del_refcnt); + } + return flow; +} + +static inline void +e2e_cache_populate_offload_item(struct dp_offload_thread_item *offload_item, + int op, + struct dp_netdev *dp, + struct dp_netdev_flow *flow, + long long now); + +static int +e2e_cache_ct_flow_offload_del_mt(struct dp_netdev *dp, + struct e2e_cache_ovs_flow *ct_flow) +{ + unsigned int tid = netdev_offload_thread_id(); + struct dp_offload_thread_item *offload_item; + struct e2e_cache_stats *e2e_stats; + struct dp_netdev_flow flow; + long long now = time_usec(); + int ret; + + e2e_stats = &dp_offload_threads[tid].e2e_stats; + memset(&flow, 0, sizeof flow); + *CONST_CAST(ovs_u128 *, &flow.mega_ufid) = ct_flow->ufid; + CONST_CAST(struct flow *, &flow.flow)->in_port.odp_port = + ct_flow->ct_match[0].odp_port; + + offload_item = dp_netdev_alloc_flow_offload(dp, &flow, + DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + now); + e2e_cache_populate_offload_item(offload_item, + DP_NETDEV_FLOW_OFFLOAD_OP_DEL, dp, &flow, + now); + + ret = dp_netdev_flow_offload_del(offload_item); + free(offload_item); + if (!ret) { + e2e_stats->del_ct_mt_flow_hw++; + } else { + e2e_stats->del_ct_mt_flow_err++; + } + return ret; +} + +static void +e2e_cache_flow_state_set_at(struct e2e_cache_ovs_flow *flow, + enum e2e_offload_state next_state, + const char *where) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(100, 100); + static const int op[E2E_OL_STATE_NUM][E2E_OL_STATE_NUM] = { + [E2E_OL_STATE_CT_SW] = { + [E2E_OL_STATE_CT_HW] = DP_NETDEV_FLOW_OFFLOAD_OP_ADD, + [E2E_OL_STATE_CT2CT] = DP_NETDEV_FLOW_OFFLOAD_OP_ADD, + [E2E_OL_STATE_CT_MT] = DP_NETDEV_FLOW_OFFLOAD_OP_ADD, + }, + [E2E_OL_STATE_CT_HW] = { + [E2E_OL_STATE_CT_SW] = DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + [E2E_OL_STATE_CT_ERR] = 
DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + }, + [E2E_OL_STATE_CT_MT] = { + [E2E_OL_STATE_CT_SW] = DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + [E2E_OL_STATE_CT2CT] = DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + [E2E_OL_STATE_CT_ERR] = DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + }, + [E2E_OL_STATE_CT2CT] = { + [E2E_OL_STATE_CT_SW] = DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + [E2E_OL_STATE_CT_MT] = DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + [E2E_OL_STATE_CT_ERR] = DP_NETDEV_FLOW_OFFLOAD_OP_DEL, + }, + [E2E_OL_STATE_CT_ERR] = { + [E2E_OL_STATE_CT_HW] = DP_NETDEV_FLOW_OFFLOAD_OP_ADD, + [E2E_OL_STATE_CT_MT] = DP_NETDEV_FLOW_OFFLOAD_OP_ADD, + [E2E_OL_STATE_CT2CT] = DP_NETDEV_FLOW_OFFLOAD_OP_ADD, + }, + }; + enum e2e_offload_state prev_state = flow->offload_state; + int flow_op; + + if (prev_state < E2E_OL_STATE_FLOW || prev_state >= E2E_OL_STATE_NUM) { + /* If flow state was not yet initialized, assume a start state that + * is assured to be a no-op regarding CT stats. + */ + prev_state = E2E_OL_STATE_FLOW; + } + + ovs_assert(next_state >= E2E_OL_STATE_FLOW && + next_state < E2E_OL_STATE_NUM); + + VLOG_DBG_RL(&rl, "%s: e2e-flow " UUID_FMT " state is %s", where, + UUID_ARGS((struct uuid *) &flow->ufid), + e2e_offload_state_names[next_state]); + flow->offload_state = next_state; + + flow_op = op[prev_state][next_state]; + if (flow_op == DP_NETDEV_FLOW_OFFLOAD_OP_NONE) { + return; + } + + e2e_cache_update_ct_stats(flow, flow_op); +} +#define e2e_cache_flow_state_set(f, s) \ + e2e_cache_flow_state_set_at(f, s, __func__) + +static int +dp_netdev_ct_offload_add_cb(struct ct_flow_offload_item *ct_offload, + struct ct_match *ct_match, struct nlattr *actions, + int actions_len); + +static struct nlattr * +e2e_cache_ct_flow_offload_add_mt(struct dp_netdev *dp, + struct e2e_cache_ovs_flow *ct_flow, + struct nlattr *actions, + uint16_t *actions_size) +{ + unsigned int tid = netdev_offload_thread_id(); + uint16_t max_actions_len = *actions_size; + struct ct_flow_offload_item offload; + struct e2e_cache_stats *e2e_stats; + int ret; + + e2e_stats = 
&dp_offload_threads[tid].e2e_stats; + + memset(&offload, 0, sizeof offload); + offload.dp = dp; + + /* Only non-offloaded CTs. Either to MT or cache. */ + if (ct_flow->offload_state != E2E_OL_STATE_CT_SW || + !ovs_list_is_empty(&ct_flow->associated_merged_flows)) { + return actions; + } + + offload.ufid = ct_flow->ufid; + if (!actions || max_actions_len < ct_flow->actions_size) { + if (actions) { + free(actions); + } + max_actions_len = ct_flow->actions_size; + actions = xmalloc(max_actions_len); + } + memcpy(actions, ct_flow->actions, ct_flow->actions_size); + ret = dp_netdev_ct_offload_add_cb(&offload, ct_flow->ct_match, actions, + ct_flow->actions_size); + + if (OVS_LIKELY(ret == 0)) { + e2e_cache_flow_state_set(ct_flow, E2E_OL_STATE_CT_MT); + e2e_stats->add_ct_mt_flow_hw++; + } else { + e2e_cache_flow_state_set(ct_flow, E2E_OL_STATE_CT_ERR); + e2e_stats->add_ct_mt_flow_err++; + } + + *actions_size = max_actions_len; + return actions; +} + +static void +e2e_cache_flow_db_del(struct e2e_cache_ufid_msg *ufid_msg) +{ + struct ovs_list merged_flows_to_delete = + OVS_LIST_INITIALIZER(&merged_flows_to_delete); + size_t hash = hash_bytes(&ufid_msg->ufid, sizeof ufid_msg->ufid, 0); + struct e2e_cache_ovs_flow *ct_flow, *iter_flow; + struct e2e_cache_merged_flow *merged_flow; + uint16_t i; + + ovs_mutex_lock(&flows_map_mutex); + ct_flow = e2e_cache_flow_db_del_protected(&ufid_msg->ufid, hash, + &merged_flows_to_delete, + ufid_msg->del_refcnt); + ovs_mutex_unlock(&flows_map_mutex); + + /* Update CT stats affected by deletion of the merged flows. 
*/ + LIST_FOR_EACH (merged_flow, node.in_list, &merged_flows_to_delete) { + for (i = 0; i < merged_flow->associated_flows_len; i++) { + iter_flow = merged_flow->associated_flows[i].mt_flow; + if (iter_flow->offload_state == E2E_OL_STATE_FLOW) { + continue; + } + if (ovs_list_is_empty(&iter_flow->associated_merged_flows)) { + e2e_cache_flow_state_set(iter_flow, E2E_OL_STATE_CT_SW); + } + } + } + if (ct_flow) { + if (ovs_refcount_unref(ufid_msg->del_refcnt) == 1) { + ovs_mutex_lock(&flows_map_mutex); + hmap_remove(&flows_map, &ct_flow->node); + ovs_mutex_unlock(&flows_map_mutex); + atomic_count_dec(&flows_map_count); + ovsrcu_postpone(e2e_cache_flow_free, ct_flow); + if (ufid_msg->del_refcnt) { + ovsrcu_postpone(free, ufid_msg->del_refcnt); + } + conntrack_conn_unref(ufid_msg->conn); + } + /* Only the thread that merged ct_flow should should change statistics + * etc. + */ + if (ct_flow->merge_tid == netdev_offload_thread_id()) { + /* This is a CT MT flow that is deleted. If it is offloaded using + * MT remove it and update CT stats. 
+ */ + if (ct_flow->offload_state == E2E_OL_STATE_CT_MT) { + e2e_cache_ct_flow_offload_del_mt(ufid_msg->dp, ct_flow); + } + e2e_cache_flow_state_set(ct_flow, E2E_OL_STATE_CT_SW); + if (ct_flow->ct_peer) { + ct_flow->ct_peer->ct_peer = NULL; + } + } + } + e2e_cache_del_merged_flows(&merged_flows_to_delete); +} + +static int +e2e_cache_flow_db_put(struct e2e_cache_ufid_msg *ufid_msg) +{ + struct e2e_cache_ovs_flow *flow_prev, *flow; + struct ovs_list merged_flows_to_delete = + OVS_LIST_INITIALIZER(&merged_flows_to_delete); + const ovs_u128 *ufid; + size_t hash; + + flow = e2e_cache_flow_alloc(ufid_msg->is_ct); + if (OVS_UNLIKELY(!flow)) { + return -1; + } + + flow->ufid = ufid_msg->ufid; + if (ufid_msg->is_ct) { + flow->ct_match[0] = ufid_msg->ct_match[0]; + e2e_cache_flow_state_set(flow, E2E_OL_STATE_CT_SW); + } else { + flow->match[0] = ufid_msg->match[0]; + e2e_cache_flow_state_set(flow, E2E_OL_STATE_FLOW); + } + flow->actions = ufid_msg->actions; + ufid_msg->actions = NULL; + flow->actions_size = ufid_msg->actions_len; + ovs_list_init(&flow->associated_merged_flows); + flow->merge_tid = INVALID_OFFLOAD_THREAD_NB; + hmap_init(&flow->merged_counters); + + ufid = &flow->ufid; + hash = hash_bytes(ufid, sizeof *ufid, 0); + + ovs_mutex_lock(&flows_map_mutex); + + flow_prev = e2e_cache_flow_find(ufid, hash); + if (flow_prev) { + e2e_cache_flow_db_del_protected(ufid, hash, &merged_flows_to_delete, + NULL); + } + + hmap_insert(&flows_map, &flow->node, hash); + + ovs_mutex_unlock(&flows_map_mutex); + atomic_count_inc(&flows_map_count); + + e2e_cache_del_merged_flows(&merged_flows_to_delete); + return 0; +} + +int +e2e_cache_flow_del(const ovs_u128 *ufid, struct dp_netdev *dp, + struct conn *conn, long long int now) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + struct e2e_cache_ufid_msg *del_msg; + struct e2e_cache_stats *e2e_stats; + struct dp_offload_thread *thread; + struct e2e_cache_ovs_flow *flow; + struct ovs_refcount *del_refcnt; + uint32_t 
hash; + + VLOG_DBG_RL(&rl, "%s: ufid="UUID_FMT, __FUNCTION__, + UUID_ARGS((struct uuid *)ufid)); + + ovs_mutex_lock(&flows_map_mutex); + hash = hash_bytes(ufid, sizeof *ufid, 0); + flow = e2e_cache_flow_find(ufid, hash); + ovs_mutex_unlock(&flows_map_mutex); + if (!flow) { + return -1; + } + del_refcnt = xmalloc(sizeof *del_refcnt); + ovs_refcount_init(del_refcnt); + DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN (thread) { + ovs_refcount_ref(del_refcnt); + } + /* Compensate for the reference taken at initialization. */ + ovs_refcount_unref(del_refcnt); + DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN (thread) { + del_msg = e2e_cache_ufid_msg_alloc(E2E_UFID_MSG_DEL, false, 0, now); + if (OVS_UNLIKELY(!del_msg)) { + free(del_refcnt); + return -1; + } + del_msg->ufid = *ufid; + del_msg->dp = dp; + del_msg->del_refcnt = del_refcnt; + del_msg->conn = conn; + + /* Insert message into queue, e2e_cache_ufid_msg_dequeue() + * is used to dequeue it from there. + */ + mpsc_queue_insert(&thread->ufid_queue, &del_msg->node); + e2e_stats = &thread->e2e_stats; + atomic_count_inc(&e2e_stats->flow_del_msgs); + } + return 0; +} + +int +e2e_cache_flow_put(bool is_ct, const ovs_u128 *ufid, const void *match, + const struct nlattr *actions, size_t actions_len, + long long int now) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + struct e2e_cache_ufid_msg *put_msg; + struct e2e_cache_stats *e2e_stats; + unsigned int tid; + + VLOG_DBG_RL(&rl, "%s: ufid="UUID_FMT, __FUNCTION__, + UUID_ARGS((struct uuid *)ufid)); + put_msg = e2e_cache_ufid_msg_alloc(E2E_UFID_MSG_PUT, is_ct, actions_len, + now); + if (OVS_UNLIKELY(!put_msg)) { + return -1; + } + + put_msg->ufid = *ufid; + if (actions_len) { + memcpy(put_msg->actions, actions, actions_len); + } + if (is_ct) { + put_msg->ct_match[0] = *((const struct ct_match *) match); + } else { + put_msg->match[0] = *((const struct match *) match); + } + + /* Insert message into queue, e2e_cache_ufid_msg_dequeue() + * is used to dequeue it from 
there. + */ + tid = netdev_offload_ufid_to_thread_id(*ufid); + mpsc_queue_insert(&dp_offload_threads[tid].ufid_queue, &put_msg->node); + e2e_stats = &dp_offload_threads[tid].e2e_stats; + atomic_count_inc(&e2e_stats->flow_add_msgs); + return 0; +} + +static int +e2e_cache_ufids_to_flows(const ovs_u128 *ufids, + uint16_t num_elements, + struct e2e_cache_ovs_flow *flows[]); +static unsigned int +netdev_offload_trace_to_thread_id(ovs_u128 *ufids, + uint16_t num_elements) +{ + struct e2e_cache_ovs_flow *mt_flows[E2E_CACHE_MAX_TRACE]; + uint32_t ufid_hash; + unsigned int tid; + uint16_t i; + + ovs_mutex_lock(&flows_map_mutex); + e2e_cache_ufids_to_flows(ufids, num_elements, mt_flows); + /* If a previous trace already determined the tid to handle, send it to + * the same one. + */ + tid = INVALID_OFFLOAD_THREAD_NB; + for (i = 0; i < num_elements; i++) { + if (!mt_flows[i]) { + ovs_mutex_unlock(&flows_map_mutex); + return INVALID_OFFLOAD_THREAD_NB; + } + } + for (i = 0; i < num_elements; i++) { + /* Skip megaflows. */ + if (i % 3 == 0) { + continue; + } + if (mt_flows[i]->merge_tid != INVALID_OFFLOAD_THREAD_NB) { + tid = mt_flows[i]->merge_tid; + break; + } + } + + if (tid == INVALID_OFFLOAD_THREAD_NB) { + ufid_hash = 1; + for (i = 0; i < num_elements; i++) { + /* Skip ct peers. */ + if (i % 3 == 2) { + continue; + } + ufid_hash = hash_words64( + (const uint64_t [2]){ ufids[i].u64.lo, + ufids[i].u64.hi }, 2, ufid_hash); + } + /* Skip main thread at ID 0. */ + tid = 1 + ufid_hash % netdev_offload_thread_nb(); + } + + /* Set the selected tid to CTs. */ + for (i = 0; i < num_elements; i++) { + /* Skip megaflows. 
*/ + if (i % 3 == 0) { + continue; + } + mt_flows[i]->merge_tid = tid; + } + ovs_mutex_unlock(&flows_map_mutex); + return tid; +} + +/* Builds one trace message per offload thread (main thread excluded) from + * the e2e traces carried by the packets of 'batch', and enqueues every + * non-empty message with e2e_cache_trace_msg_enqueue(), stamped with 'dp' + * and 'now'. + * + * Untraced, aborted, throttled, overflowed and non-conntrack traces are + * skipped, as are traces whose target thread queue is considered full. */ +void +e2e_cache_dispatch_trace_message(struct dp_netdev *dp, + struct dp_packet_batch *batch, + long long int now) +{ + struct e2e_cache_trace_info *cur_trace_info[MAX_OFFLOAD_THREAD_NB]; + struct e2e_cache_trace_message *buffer[MAX_OFFLOAD_THREAD_NB]; + struct e2e_cache_stats *e2e_stats[MAX_OFFLOAD_THREAD_NB]; + unsigned int first_tid = NETDEV_OFFLOAD_THREAD_MAIN; + uint32_t num_elements[MAX_OFFLOAD_THREAD_NB]; + uint32_t cur_q_size[MAX_OFFLOAD_THREAD_NB]; + struct dp_offload_thread *thread; + struct dp_packet *packet; + size_t buffer_size; + unsigned int tid; + + /* Room for two entries per packet: a tnl_pop trace is split into a + * single-ufid entry plus the rest of the trace. */ + buffer_size = sizeof(struct e2e_cache_trace_message) + + 2 * batch->count * sizeof(struct e2e_cache_trace_info); + + DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN (thread, tid) { + if (first_tid == NETDEV_OFFLOAD_THREAD_MAIN) { + first_tid = tid; + } + buffer[tid] = + (struct e2e_cache_trace_message *) xmalloc_cacheline(buffer_size); + num_elements[tid] = 0; + cur_trace_info[tid] = &buffer[tid]->data[0]; + /* The queue depth is sampled once here and is not refreshed while + * the batch is being processed. */ + cur_q_size[tid] = + atomic_count_get(&dp_offload_threads[tid].e2e_stats.queue_trcs); + e2e_stats[tid] = &dp_offload_threads[tid].e2e_stats; + } + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + uint32_t e2e_trace_size = packet->e2e_trace_size; + ovs_u128 *e2e_trace = &packet->e2e_trace[0]; + + /* Don't send untraced packets. */ + if (!e2e_trace_size) { + continue; + } + + /* Don't send aborted traces */ + if (OVS_UNLIKELY(packet->e2e_trace_flags & + E2E_CACHE_TRACE_FLAG_ABORT)) { + atomic_count_inc(&e2e_stats[first_tid]->aborted_trcs); + continue; + } + /* In case the packet had tnl_pop, we split the trace to the tnl_pop + * ufid (the 1st one in the trace), and the rest of the trace, + * representing the path of the packet with the virtual port. Once the + * tnl_pop flow is offloaded, we will get only the virtual port path. 
+ */ + if (packet->e2e_trace_flags & E2E_CACHE_TRACE_FLAG_TNL_POP) { + tid = netdev_offload_ufid_to_thread_id(e2e_trace[0]); + if (tid == INVALID_OFFLOAD_THREAD_NB) { + continue; + } + if (dp_netdev_e2e_cache_trace_q_size && + cur_q_size[tid] >= dp_netdev_e2e_cache_trace_q_size) { + atomic_count_inc(&e2e_stats[tid]->overflow_trcs); + continue; + } + + /* NOTE(review): 'orig_in_port' is not set on this single-ufid + * entry, unlike the full-trace entry filled further down - + * confirm that consumers do not read it. */ + cur_trace_info[tid]->num_elements = 1; + cur_trace_info[tid]->e2e_trace_ct_ufids = 0; + /* Presumably shifts out the CT bit of the tnl_pop ufid, which is + * removed from the front of the trace below. */ + packet->e2e_trace_ct_ufids >>= 1; + + memcpy(&cur_trace_info[tid]->ufids[0], e2e_trace, + sizeof *e2e_trace); + + e2e_trace_size--; + e2e_trace++; + num_elements[tid]++; + cur_trace_info[tid]++; + packet->e2e_trace_flags &= ~E2E_CACHE_TRACE_FLAG_TNL_POP; + if (!packet->e2e_trace_ct_ufids) { + continue; + } + } + /* If the trace is marked as "throttled" this means that it must be + * omitted from sending due to high messages rate. + */ + if (packet->e2e_trace_flags & E2E_CACHE_TRACE_FLAG_THROTTLED) { + atomic_count_inc(&e2e_stats[first_tid]->throttled_trcs); + continue; + } + /* Don't send "partial" traces due to overflow of the trace storage */ + if (OVS_UNLIKELY(packet->e2e_trace_flags & + E2E_CACHE_TRACE_FLAG_OVERFLOW)) { + atomic_count_inc(&e2e_stats[first_tid]->discarded_trcs); + continue; + } + /* Send only traces for packet that passed conntrack */ + if (!packet->e2e_trace_ct_ufids) { + atomic_count_inc(&e2e_stats[first_tid]->discarded_trcs); + continue; + } + + tid = netdev_offload_trace_to_thread_id(e2e_trace, e2e_trace_size); + if (tid == INVALID_OFFLOAD_THREAD_NB) { + continue; + } + + if (dp_netdev_e2e_cache_trace_q_size && + cur_q_size[tid] >= dp_netdev_e2e_cache_trace_q_size) { + atomic_count_inc(&e2e_stats[tid]->overflow_trcs); + continue; + } + + cur_trace_info[tid]->e2e_trace_ct_ufids = packet->e2e_trace_ct_ufids; + cur_trace_info[tid]->num_elements = e2e_trace_size; + cur_trace_info[tid]->orig_in_port = packet->md.orig_in_port; + + memcpy(&cur_trace_info[tid]->ufids[0], e2e_trace, + e2e_trace_size * sizeof 
*e2e_trace); + + num_elements[tid]++; + cur_trace_info[tid]++; + } + + DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN (thread, tid) { + if (num_elements[tid] == 0) { + free_cacheline(buffer[tid]); + continue; + } + + buffer[tid]->dp = dp; + buffer[tid]->num_elements = num_elements[tid]; + buffer[tid]->timestamp = now; + + e2e_cache_trace_msg_enqueue(buffer[tid], tid); + atomic_count_inc(&e2e_stats[tid]->generated_trcs); + } +} + +/* Sets E2E_CACHE_TRACE_FLAG_TNL_POP in the trace flags of 'packet'. */ +void +e2e_cache_trace_tnl_pop(struct dp_packet *packet) +{ + packet->e2e_trace_flags |= E2E_CACHE_TRACE_FLAG_TNL_POP; +} + +/* Looks up each of the 'num_elements' 'ufids' with e2e_cache_flow_find() + * and stores the result in the matching slot of 'flows'. + * + * Stops and returns -1 at the first ufid that has no flow: that slot is + * set to NULL and the following slots are left untouched. Returns 0 when + * every ufid was found. + * + * Two adjacent entries whose 'offload_state' is not E2E_OL_STATE_FLOW are + * linked as each other's 'ct_peer' when that field is still unset. */ +static int +e2e_cache_ufids_to_flows(const ovs_u128 *ufids, + uint16_t num_elements, + struct e2e_cache_ovs_flow *flows[]) + OVS_REQUIRES(flows_map_mutex) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + const ovs_u128 *ufid; + uint32_t hash; + uint16_t i; + + for (i = 0; i < num_elements; i++) { + ufid = &ufids[i]; + hash = hash_bytes(ufid, sizeof *ufid, 0); + flows[i] = e2e_cache_flow_find(ufid, hash); + VLOG_DBG_RL(&rl, "%s: ufids[%d]="UUID_FMT" flows[%d]=%p", __func__, + i, UUID_ARGS((struct uuid *)ufid), i, flows[i]); + if (OVS_UNLIKELY(!flows[i])) { + return -1; + } + if (i > 0 && flows[i - 1]->offload_state != E2E_OL_STATE_FLOW && + flows[i]->offload_state != E2E_OL_STATE_FLOW) { + if (!flows[i - 1]->ct_peer) { + flows[i - 1]->ct_peer = flows[i]; + } + if (!flows[i]->ct_peer) { + flows[i]->ct_peer = flows[i - 1]; + } + } + } + return 0; +} + +/* Fills 'offload_item' as a DP_OFFLOAD_FLOW item for operation 'op' on + * 'flow', flagged as an e2e cache flow and stamped with 'dp' and 'now'. */ +static inline void +e2e_cache_populate_offload_item(struct dp_offload_thread_item *offload_item, + int op, + struct dp_netdev *dp, + struct dp_netdev_flow *flow, + long long now) +{ + struct dp_offload_flow_item *flow_offload; + flow_offload = &offload_item->data->flow; + + /* NOTE(review): only sizeof *offload_item bytes are cleared. Whether + * that covers the 'data->flow' area written below depends on the + * struct layout, which is not visible here - confirm. */ + memset(offload_item, 0, sizeof *offload_item); + offload_item->type = DP_OFFLOAD_FLOW; + offload_item->dp = dp; + offload_item->timestamp = now; + flow_offload->flow = flow; + flow_offload->op = op; + flow_offload->is_e2e_cache_flow = true; + flow_offload->orig_in_port = 
flow->orig_in_port; +} + +/* Removes the counter bookkeeping of 'merged_flow': drops the matching + * counter item from 'merged_counters' of every associated flow, unlinks + * 'merged_flow' from its flow and CT counter lists, and frees a counter + * item of 'counter_map' once its list has become empty. + * + * Does nothing if 'flow_counter_list' is empty. Takes and releases + * 'flows_map_mutex'. */ +static void +e2e_cache_disassociate_counters(struct e2e_cache_merged_flow *merged_flow) +{ + size_t counter_hash, ct_counter_hash, flows_counter_hash; + struct flows_counter_key counter_key_on_stack; + const struct flows_counter_key *counter_key; + struct e2e_cache_counter_item *counter_item; + struct e2e_cache_ovs_flow *mt_flow; + struct ovs_list *next_counter; + uint16_t i; + + /* If flow_counter_list is empty this means e2e_cache_associate_counters + * was not executed for this e2e_cache_merged_flow. + * In such case ct_counter_list must also be empty. + */ + if (OVS_UNLIKELY(ovs_list_is_empty(&merged_flow->flow_counter_list))) { + return; + } + + memset(&counter_key_on_stack, 0, sizeof counter_key_on_stack); + ct_counter_hash = merged_flow->ct_counter_key; + counter_key_on_stack.ptr_key = ct_counter_hash; + flows_counter_hash = hash_bytes(&merged_flow->flows_counter_key, + sizeof merged_flow->flows_counter_key, + 0); + + ovs_mutex_lock(&flows_map_mutex); + for (i = 0; i < merged_flow->associated_flows_len; i++) { + mt_flow = merged_flow->associated_flows[i].mt_flow; + /* Megaflow entries use the flows counter key, the others the CT + * counter key. */ + if (mt_flow->offload_state == E2E_OL_STATE_FLOW) { + counter_hash = flows_counter_hash; + counter_key = &merged_flow->flows_counter_key; + } else { + counter_hash = ct_counter_hash; + counter_key = &counter_key_on_stack; + } + HMAP_FOR_EACH_WITH_HASH (counter_item, node, counter_hash, + &mt_flow->merged_counters) { + if (counter_item->hash == counter_hash && + !e2e_cache_counter_cmp_key(counter_item, counter_key)) { + break; + } + } + if (OVS_LIKELY(counter_item)) { + hmap_remove(&mt_flow->merged_counters, &counter_item->node); + free(counter_item); + } + } + /* If the node that followed 'merged_flow' is an empty list after the + * removal, it is the 'merged_flows' head of a counter item that no + * merged flow uses any more: drop that item. */ + next_counter = ovs_list_front(&merged_flow->flow_counter_list); + ovs_list_remove(&merged_flow->flow_counter_list); + if (ovs_list_is_empty(next_counter)) { + counter_item = CONTAINER_OF(next_counter, + struct e2e_cache_counter_item, + merged_flows); + hmap_remove(&counter_map, &counter_item->node); + 
free(counter_item); + } + /* If the node that followed 'merged_flow' in the CT counter list is an + * empty list after the removal, it is the 'merged_flows' head of a + * counter item with no remaining users: drop that item. */ + next_counter = ovs_list_front(&merged_flow->ct_counter_list); + ovs_list_remove(&merged_flow->ct_counter_list); + if (ovs_list_is_empty(next_counter)) { + counter_item = CONTAINER_OF(next_counter, + struct e2e_cache_counter_item, + merged_flows); + hmap_remove(&counter_map, &counter_item->node); + free(counter_item); + } + ovs_mutex_unlock(&flows_map_mutex); +} + +/* Records the counters used by 'merged_flow'. + * + * For each of the first 'num_elements' entries of the trace, makes sure + * the corresponding flow of 'mt_flows' has a counter item in its + * 'merged_counters' map, keyed by the CT counter key for entries flagged + * in 'trc_info->e2e_trace_ct_ufids' and by the flows counter key + * otherwise. Then appends 'merged_flow' to the CT and flows counter items + * of 'counter_map', creating them when missing. + * + * Takes and releases 'flows_map_mutex'. */ +static void +e2e_cache_associate_counters(struct e2e_cache_merged_flow *merged_flow, + struct e2e_cache_ovs_flow *mt_flows[], + const struct e2e_cache_trace_info *trc_info, + uint16_t num_elements) +{ + size_t counter_hash, ct_counter_key_hash, flows_counter_key_hash; + struct flows_counter_key counter_key_on_stack; + const struct flows_counter_key *counter_key; + struct e2e_cache_counter_item *counter_item; + uint16_t mt_index, flows_index = 0; + + BUILD_ASSERT_DECL(sizeof(size_t) >= sizeof(uintptr_t)); + + memset(&counter_key_on_stack, 0, sizeof counter_key_on_stack); + /* The CT counter key is used directly as its own hash value. */ + ct_counter_key_hash = merged_flow->ct_counter_key; + counter_key_on_stack.ptr_key = ct_counter_key_hash; + flows_counter_key_hash = hash_bytes(&merged_flow->flows_counter_key, + sizeof merged_flow->flows_counter_key, + 0); + + ovs_mutex_lock(&flows_map_mutex); + + for (mt_index = 0; mt_index < num_elements; mt_index++) { + bool is_ct = trc_info->e2e_trace_ct_ufids & (1 << mt_index); + bool counter_found = false; + + if (is_ct) { + counter_key = &counter_key_on_stack; + counter_hash = ct_counter_key_hash; + } else { + counter_key = &merged_flow->flows_counter_key; + counter_hash = flows_counter_key_hash; + } + /* Search if this counter is already used by this flow. 
*/ + HMAP_FOR_EACH_WITH_HASH (counter_item, node, counter_hash, + &mt_flows[flows_index]->merged_counters) { + if (counter_item->hash == counter_hash && + !e2e_cache_counter_cmp_key(counter_item, counter_key)) { + /* A key match with a different counter kind is treated as + * impossible. */ + if (OVS_UNLIKELY(counter_item->is_ct != is_ct)) { + OVS_NOT_REACHED(); + } + counter_found = true; + break; + } + } + /* If this counter is not in use by this flow, add it. */ + if (!counter_found) { + counter_item = e2e_cache_counter_alloc(counter_key, counter_hash, + is_ct); + hmap_insert(&mt_flows[flows_index]->merged_counters, + &counter_item->node, counter_hash); + } + /* 'flows_index' advances in lockstep with 'mt_index'. */ + flows_index++; + } + + /* Search for an already existing CT counter item, or create if not. */ + counter_hash = ct_counter_key_hash; + counter_key = &counter_key_on_stack; + counter_item = e2e_cache_counter_find(counter_hash, counter_key); + if (!counter_item) { + counter_item = e2e_cache_counter_alloc(counter_key, counter_hash, + true); + hmap_insert(&counter_map, &counter_item->node, counter_hash); + } + /* Add the merged flow to the counter item. */ + ovs_list_push_back(&counter_item->merged_flows, + &merged_flow->ct_counter_list); + + /* Search for an already existing flows counter item, or create if not. */ + counter_hash = flows_counter_key_hash; + counter_key = &merged_flow->flows_counter_key; + counter_item = e2e_cache_counter_find(counter_hash, counter_key); + if (!counter_item) { + counter_item = e2e_cache_counter_alloc(counter_key, counter_hash, + false); + hmap_insert(&counter_map, &counter_item->node, counter_hash); + } + /* Add the merged flow to the counter item. 
*/ + ovs_list_push_back(&counter_item->merged_flows, + &merged_flow->flow_counter_list); + + ovs_mutex_unlock(&flows_map_mutex); +} + +/* Issues a DP_NETDEV_FLOW_OFFLOAD_OP_DEL for 'merged_flow' through + * dp_netdev_flow_offload_del(), using a temporary on-stack dp_netdev_flow + * that carries the merged flow ufid and in_port. + * + * Clears 'merged_flow->dp', increments 'del_merged_flow_hw' of the calling + * offload thread and returns the result of the delete. 'merged_flow->dp' + * must be non-NULL on entry. */ +static int +e2e_cache_merged_flow_offload_del(struct e2e_cache_merged_flow *merged_flow) +{ + unsigned int tid = netdev_offload_thread_id(); + struct dp_offload_thread_item *offload_item; + struct dp_netdev *dp = merged_flow->dp; + struct e2e_cache_stats *e2e_stats; + struct dp_netdev_flow flow; + int rv; + + e2e_stats = &dp_offload_threads[tid].e2e_stats; + + ovs_assert(dp); + + memset(&flow, 0, sizeof flow); + *CONST_CAST(ovs_u128 *, &flow.mega_ufid) = merged_flow->ufid; + CONST_CAST(struct flow *, &flow.flow)->in_port = + merged_flow->merged_match.spec.in_port; + + offload_item = xmalloc(sizeof *offload_item + + sizeof offload_item->data->flow); + e2e_cache_populate_offload_item(offload_item, + DP_NETDEV_FLOW_OFFLOAD_OP_DEL, dp, &flow, + time_usec()); + + merged_flow->dp = NULL; + e2e_stats->del_merged_flow_hw++; + rv = dp_netdev_flow_offload_del(offload_item); + free(offload_item); + return rv; +} + +/* Computes the counter keys of 'merged_flow' from the first 'num_elements' + * entries of the trace: 'flows_counter_key' collects the ufids of the + * entries not flagged in 'trc_info->e2e_trace_ct_ufids', and + * 'ct_counter_key' is the lowest 'mt_flows' pointer value among the CT + * entries. 'ct_counter_key' is left at 0 when 'e2e_trace_ct_ufids' is + * zero. */ +static void +e2e_cache_calc_counters(struct e2e_cache_merged_flow *merged_flow, + struct e2e_cache_ovs_flow *mt_flows[], + const struct e2e_cache_trace_info *trc_info, + uint16_t num_elements) +{ + uintptr_t ptr, ct_counter_key = UINTPTR_MAX; + uint16_t mt_index, flows_index = 0; + + merged_flow->ct_counter_key = 0; + memset(&merged_flow->flows_counter_key, 0, + sizeof merged_flow->flows_counter_key); + + for (mt_index = 0; mt_index < num_elements; mt_index++) { + if (trc_info->e2e_trace_ct_ufids & (1 << mt_index)) { + /* Handle a CT pair only once, at its second entry. */ + if (trc_info->e2e_trace_ct_ufids & (1 << (mt_index + 1))) { + continue; + } + /* CT are traced only for both directions, adjacent. Calc + * ct_counter as the lowest value among all pointers to DB items + * for all CT in the trace. 
+ */ + ptr = (uintptr_t) mt_flows[mt_index]; + if (ptr < ct_counter_key) { + ct_counter_key = ptr; + } + /* NOTE(review): relies on CT ufids coming in adjacent pairs so + * that mt_index >= 1 here. A lone CT bit at index 0 would read + * mt_flows[-1] - confirm the trace format guarantees this. */ + ptr = (uintptr_t) mt_flows[mt_index - 1]; + if (ptr < ct_counter_key) { + ct_counter_key = ptr; + } + } else { + merged_flow->flows_counter_key.ufid_key[flows_index++] = + trc_info->ufids[mt_index]; + } + } + + if (trc_info->e2e_trace_ct_ufids) { + merged_flow->ct_counter_key = ct_counter_key; + } +} + +/* Offloads 'merged_flow' to hardware: computes and associates its + * counters, then issues a DP_NETDEV_FLOW_OFFLOAD_OP_ADD through + * dp_netdev_flow_offload_put() with a copy of the merged match and + * actions. + * + * On success stores 'dp' in 'merged_flow', increments 'add_merged_flow_hw' + * of the calling offload thread and returns 0. Otherwise returns the + * error. */ +static int +e2e_cache_merged_flow_offload_put(struct dp_netdev *dp, + struct e2e_cache_merged_flow *merged_flow, + struct e2e_cache_ovs_flow *mt_flows[], + const struct e2e_cache_trace_info *trc_info) +{ + unsigned int tid = netdev_offload_thread_id(); + struct dp_offload_thread_item *offload_item; + struct dp_offload_flow_item *flow_offload; + struct e2e_cache_stats *e2e_stats; + struct dp_netdev_flow flow; + union flow_in_port in_port; + uint16_t num_elements; + int err; + + e2e_stats = &dp_offload_threads[tid].e2e_stats; + + in_port = merged_flow->merged_match.spec.in_port; + + /* Temporary on-stack flow that only carries what the offload request + * needs: mark, ufid, in_port and orig_in_port. */ + memset(&flow, 0, sizeof flow); + flow.mark = merged_flow->flow_mark; + flow.dead = false; + *CONST_CAST(ovs_u128 *, &flow.mega_ufid) = merged_flow->ufid; + CONST_CAST(struct flow *, &flow.flow)->in_port = in_port; + flow.orig_in_port = trc_info->orig_in_port; + + offload_item = xmalloc(sizeof *offload_item + + sizeof offload_item->data->flow); + e2e_cache_populate_offload_item(offload_item, + DP_NETDEV_FLOW_OFFLOAD_OP_ADD, dp, &flow, + time_usec()); + + num_elements = trc_info->num_elements; + /* For CT2CT, don't associate the last megaflow. 
*/ + if (flow.mark != INVALID_FLOW_MARK) { + num_elements--; + } + e2e_cache_calc_counters(merged_flow, mt_flows, trc_info, num_elements); + e2e_cache_associate_counters(merged_flow, mt_flows, trc_info, + num_elements); + + flow_offload = &offload_item->data->flow; + merged_match_to_match(&flow_offload->match, &merged_flow->merged_match); + flow_offload->actions = xmalloc(merged_flow->actions_size); + /* NOTE(review): this path jumps to 'error' without freeing + * 'offload_item'. Presumably unreachable if xmalloc() never returns + * NULL - confirm. */ + if (OVS_UNLIKELY(!flow_offload->actions)) { + err = -1; + goto error; + } + + memcpy(flow_offload->actions, merged_flow->actions, + merged_flow->actions_size); + flow_offload->actions_len = merged_flow->actions_size; + flow_offload->ct_counter_key = merged_flow->ct_counter_key; + memcpy(&flow_offload->flows_counter_key, &merged_flow->flows_counter_key, + sizeof flow_offload->flows_counter_key); + err = dp_netdev_flow_offload_put(offload_item); + free(flow_offload->actions); + free(offload_item); + if (OVS_UNLIKELY(err != 0)) { + goto error; + } + + merged_flow->dp = dp; + e2e_stats->add_merged_flow_hw++; + return 0; + +error: + return err; +} + +/* Calls e2e_cache_ct_flow_offload_add_mt() on each of the 'num_flows' + * entries of 'mt_flows', passing one actions buffer from call to call and + * freeing it at the end. */ +static void +e2e_cache_offload_ct_mt_flows(struct dp_netdev *dp, + struct e2e_cache_ovs_flow *mt_flows[], + uint16_t num_flows) +{ + struct nlattr *actions = NULL; + uint16_t max_actions_len = 0; + uint16_t i; + + for (i = 0; i < num_flows; i++) { + actions = e2e_cache_ct_flow_offload_add_mt(dp, mt_flows[i], + actions, &max_actions_len); + } + if (actions) { + free(actions); + } +} + +static int +e2e_cache_merge_flows(struct e2e_cache_ovs_flow **flows, + uint16_t num_flows, + struct e2e_cache_merged_flow *merged_flow, + struct ofpbuf *merged_actions); +static int +ct2ct_merge_flows(struct e2e_cache_ovs_flow **flows, + uint16_t num_flows, + struct e2e_cache_merged_flow *merged_flow, + struct ofpbuf *merged_actions); + +/* CT rules should be offloaded either to MT or cache, but not both. For a + * merged flow created, remove from HW the MT rules for its composing CT + * flows. 
+ */ +static void +e2e_cache_purge_ct_flows_from_mt(struct dp_netdev *dp, + struct e2e_cache_ovs_flow *mt_flows[], + uint16_t num_flows) +{ + uint16_t i; + + for (i = 0; i < num_flows; i++) { + /* When an e2e flow is created, it should remove its MT HW rule if + * exists, but not its peer MT HW rule. Skip if not in HW or peer + * CT flows. + */ + if (mt_flows[i]->offload_state != E2E_OL_STATE_CT_MT || + (i > 0 && mt_flows[i - 1]->offload_state != E2E_OL_STATE_FLOW)) { + continue; + } + e2e_cache_ct_flow_offload_del_mt(dp, mt_flows[i]); + e2e_cache_flow_state_set(mt_flows[i], E2E_OL_STATE_CT_SW); + } +} + +/* Merges the flows named by the trace 'trc_info' into one merged flow, + * stores it in the merged flows DB and offloads it to hardware. 'tid' is + * recorded in the merged flow. + * + * Once the merged flows count has reached 'dp_netdev_e2e_cache_size', + * traces of more than four elements are merged as CT2CT instead, while + * shorter ones only get their flows passed to + * e2e_cache_offload_ct_mt_flows() and the function returns -1. + * + * Returns 0 on success. Failures return -1 or the error reported by the + * failing step. */ +int +e2e_cache_process_trace_info(struct dp_netdev *dp, + const struct e2e_cache_trace_info *trc_info, + unsigned int tid) +{ + struct e2e_cache_merged_flow *merged_flow; + struct nlattr *actions; + size_t actions_len; + struct ofpbuf merged_actions; + struct e2e_cache_ovs_flow *mt_flows[E2E_CACHE_MAX_TRACE]; + uint64_t merged_actions_buf[1024 / sizeof(uint64_t)]; + uint32_t merged_flows_count; + uint16_t i, num_flows; + bool ct2ct; + int err; + + ovs_mutex_lock(&flows_map_mutex); + err = e2e_cache_ufids_to_flows(trc_info->ufids, trc_info->num_elements, + mt_flows); + ovs_mutex_unlock(&flows_map_mutex); + if (OVS_UNLIKELY(err)) { + return -1; + } + num_flows = trc_info->num_elements; + merged_flows_count = atomic_count_get(&merged_flows_map_count); + ct2ct = num_flows > 4 && merged_flows_count >= dp_netdev_e2e_cache_size; + if (!ct2ct && merged_flows_count >= dp_netdev_e2e_cache_size) { + e2e_cache_offload_ct_mt_flows(dp, mt_flows, num_flows); + return -1; + } + + /* One 'associated_flows' slot per flow of the trace. */ + merged_flow = xzalloc(sizeof *merged_flow + + num_flows * sizeof merged_flow->associated_flows[0]); + if (OVS_UNLIKELY(!merged_flow)) { + return -1; + } + merged_flow->tid = tid; + ovs_list_init(&merged_flow->flow_counter_list); + ovs_list_init(&merged_flow->ct_counter_list); + for (i = 0; i < num_flows; i++) { + ovs_list_init(&merged_flow->associated_flows[i].list); + } + + 
ofpbuf_use_stack(&merged_actions, &merged_actions_buf, + sizeof merged_actions_buf); + + err = ct2ct + ? ct2ct_merge_flows(mt_flows, num_flows, merged_flow, + &merged_actions) + : e2e_cache_merge_flows(mt_flows, num_flows, merged_flow, + &merged_actions); + if (OVS_UNLIKELY(err)) { + goto free_merged_flow; + } + + /* Copy the merged actions out of the stack buffer so that they outlive + * this call. */ + actions = (struct nlattr *) ofpbuf_at_assert(&merged_actions, 0, + sizeof(struct nlattr)); + actions_len = merged_actions.size; + merged_flow->actions = (struct nlattr *) xmalloc(actions_len); + /* NOTE(review): 'err' is still 0 on this path, so the function would + * report success after freeing 'merged_flow'. Presumably unreachable + * if xmalloc() never returns NULL - confirm. */ + if (OVS_UNLIKELY(!merged_flow->actions)) { + goto free_merged_flow; + } + memcpy(merged_flow->actions, actions, actions_len); + merged_flow->actions_size = actions_len; + + /* For CT2CT the last flow of the trace is not associated. */ + e2e_cache_associate_merged_flow(merged_flow, mt_flows, + ct2ct ? num_flows - 1 : num_flows); + + err = e2e_cache_merged_flow_db_put(merged_flow); + if (OVS_UNLIKELY(err)) { + goto disassociate_merged_flow; + } + + err = e2e_cache_merged_flow_offload_put(dp, merged_flow, mt_flows, + trc_info); + if (OVS_UNLIKELY(err)) { + goto remove_flow_from_db; + } + e2e_cache_purge_ct_flows_from_mt(dp, mt_flows, num_flows); + for (i = 0; i < num_flows; i++) { + if (mt_flows[i]->offload_state == E2E_OL_STATE_FLOW || + (i > 0 && mt_flows[i - 1]->offload_state != E2E_OL_STATE_FLOW)) { + continue; + } + /* Update CT stats affected by offloading the merged flow. */ + e2e_cache_flow_state_set(mt_flows[i], ct2ct ? 
E2E_OL_STATE_CT2CT + : E2E_OL_STATE_CT_HW); + } + return 0; + +/* NOTE(review): this path returns without freeing 'merged_flow' here; + * presumably e2e_cache_merged_flow_db_del() releases it - confirm. */ +remove_flow_from_db: + e2e_cache_merged_flow_db_del(merged_flow); + return err; +disassociate_merged_flow: + /* Lock/unlock to prevent race condition with + * e2e_cache_get_merged_flows_stats() + */ + ovs_mutex_lock(&flows_map_mutex); + e2e_cache_disassociate_merged_flow(merged_flow); + ovs_mutex_unlock(&flows_map_mutex); +free_merged_flow: + e2e_cache_merged_flow_free(merged_flow); + return err; +} + +/* Adds to 'stats' the hardware statistics of the merged flows built on the + * flow identified by 'mt_ufid'. + * + * For a flow in state E2E_OL_STATE_CT2CT, queries the first associated + * merged flow with netdev_flow_get(). Otherwise walks the flow's + * 'merged_counters': flows in state E2E_OL_STATE_FLOW query one merged + * flow per counter with netdev_flow_get(), the others call + * netdev_ct_counter_query(). + * + * Returns false if the flow is unknown or has no associated merged flow. + * Returns true if a netdev_flow_get() query succeeded, or if 'stats->used' + * equals 'now' after a CT counter query. Takes and releases + * 'flows_map_mutex'. */ +bool +e2e_cache_get_merged_flows_stats(struct netdev *netdev, + struct match *match, + struct nlattr **actions, + const ovs_u128 *mt_ufid, + struct dpif_flow_stats *stats, + struct ofpbuf *buf, + long long now, + long long prev_now) +{ + struct e2e_cache_counter_item *mt_counter_item, *mapped_counter_item; + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + struct e2e_cache_merged_flow *merged_flow; + struct dpif_flow_stats merged_stats; + struct dpif_flow_attrs merged_attr; + struct ovs_list *merged_flow_node; + struct e2e_cache_ovs_flow *flow; + bool rv = false; + uint32_t hash; + int ret; + + hash = hash_bytes(mt_ufid, sizeof *mt_ufid, 0); + + ovs_mutex_lock(&flows_map_mutex); + flow = e2e_cache_flow_find(mt_ufid, hash); + if (OVS_UNLIKELY(!flow) || + ovs_list_is_empty(&flow->associated_merged_flows)) { + ovs_mutex_unlock(&flows_map_mutex); + return false; + } + if (flow->offload_state == E2E_OL_STATE_CT2CT) { + struct flow2flow_item *associated_flow_item; + + /* Recover the merged flow from the list node embedded in its + * 'associated_flows' array, using the index stored in the item. */ + associated_flow_item = + CONTAINER_OF(ovs_list_front(&flow->associated_merged_flows), + struct flow2flow_item, list); + merged_flow = + CONTAINER_OF(associated_flow_item, + struct e2e_cache_merged_flow, + associated_flows[associated_flow_item->index]); + /* Query the counter. 
*/ + ret = netdev_flow_get(netdev, match, actions, &merged_flow->ufid, + &merged_stats, &merged_attr, buf, now); + if (ret) { + VLOG_ERR_RL(&rl, "Failed to get merged flow ufid "UUID_FMT, + UUID_ARGS((struct uuid *) &merged_flow->ufid)); + goto out; + } + stats->n_bytes += merged_stats.n_bytes; + stats->n_packets += merged_stats.n_packets; + stats->used = MAX(stats->used, merged_stats.used); + rv = true; + goto out; + } + + /* Not CT2CT: visit every counter recorded on this flow. */ + HMAP_FOR_EACH (mt_counter_item, node, &flow->merged_counters) { + if (flow->offload_state == E2E_OL_STATE_FLOW) { + /* Get the counter item from the global map. */ + mapped_counter_item = + e2e_cache_counter_find(mt_counter_item->hash, + &mt_counter_item->key); + if (OVS_UNLIKELY(!mapped_counter_item)) { + VLOG_ERR_RL(&rl, "Failed to get counter item for ufid " + UUID_FMT, UUID_ARGS((struct uuid *) &flow->ufid)); + continue; + } + + /* Get one of the merged flows using this counter. */ + merged_flow_node = + ovs_list_front(&mapped_counter_item->merged_flows); + merged_flow = CONTAINER_OF(merged_flow_node, + struct e2e_cache_merged_flow, + flow_counter_list); + /* Query the counter. 
*/ + ret = netdev_flow_get(netdev, match, actions, &merged_flow->ufid, + &merged_stats, &merged_attr, buf, now); + if (ret) { + VLOG_ERR_RL(&rl, "Failed to get merged flow ufid "UUID_FMT, + UUID_ARGS((struct uuid *) &merged_flow->ufid)); + continue; + } + stats->n_bytes += merged_stats.n_bytes; + stats->n_packets += merged_stats.n_packets; + stats->used = MAX(stats->used, merged_stats.used); + rv = true; + } else { + ret = netdev_ct_counter_query(netdev, mt_counter_item->key.ptr_key, + now, prev_now, stats); + if (ret) { + VLOG_ERR_RL(&rl, "Failed to query ct counter netdev=%s, " + "ptr_key=%"PRIxPTR, netdev_get_name(netdev), + mt_counter_item->key.ptr_key); + continue; + } + /* A CT counter counts as a hit only if it was used at 'now'. */ + rv |= stats->used == now; + } + } +out: + ovs_mutex_unlock(&flows_map_mutex); + return rv; +} + +int +e2e_cache_flow_del(const ovs_u128 *ufid, struct dp_netdev *dp, + struct conn *conn, long long int now); + +/* Forwards to e2e_cache_flow_del(), discarding its result. */ +static void +dp_netdev_ct_offload_e2e_del(ovs_u128 *ufid, void *dp, + struct conn *conn, long long int now) +{ + e2e_cache_flow_del(ufid, dp, conn, now); +} + +static int +dp_netdev_ct_offload_e2e_active(struct ct_flow_offload_item *offload, + long long now, long long prev_now); + +static void +pmd_thread_offload_netdevs(struct dp_netdev_pmd_thread *pmd, odp_port_t port_nos[CT_DIR_NUM], + struct netdev *netdevs[CT_DIR_NUM]); + +/* Adds or deletes, according to 'op', the hardware offload of the + * connections in 'conns'. The ports are taken from the first connection + * of the batch. + * + * With a non-NULL 'pmd' the netdevs come from pmd_thread_offload_netdevs(), + * falling back to netdev_ports_get_short(). Otherwise they are obtained + * with netdev_ports_get() and closed before returning, and the datapath + * port lock is taken around the operation. + * + * On delete, two references are dropped on every connection of the batch. + * Failures are only logged. */ +static void +dp_netdev_offload_conns(struct dp_netdev_pmd_thread *pmd, + struct conntrack *ct, struct batch *conns, int op) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + struct dp_netdev *dp = conntrack_datapath(ct); + struct conn *first = batch_first(conns); + struct netdev *netdevs[CT_DIR_NUM]; + odp_port_t ports[CT_DIR_NUM]; + int n = batch_size(conns); + const char *desc = NULL; + int ret = 0; + + conntrack_offload_get_ports(first, ports); + if (pmd) { + memset(netdevs, 0, sizeof netdevs); + pmd_thread_offload_netdevs(pmd, ports, netdevs); + /* No need to lock the datapath port_rwlock here: we are in the 
context + * of a polling thread, which will safely pause itself whenever a thread + * has to modify the port map. + * + * Similarly, the netdev references are not kept, as they are + * already held within the thread poll_list. */ + for (int i = CT_DIR_INIT; i < CT_DIR_NUM; i++) { + if (!netdevs[i]) { + netdevs[i] = netdev_ports_get_short(ports[i], dpif_normalize_type(dp->class->type)); + } + } + } else { + for (int i = CT_DIR_INIT; i < CT_DIR_NUM; i++) { + netdevs[i] = netdev_ports_get(ports[i], dpif_normalize_type(dp->class->type)); + } + /* Released by the ovs_rwlock_unlock() in the '!pmd' block below. */ + dp_netdev_port_rdlock_limit(dp, 10); + } + + /* NOTE(review): 'desc' stays NULL for any other 'op' and is later + * passed to a %s log conversion. Presumably callers only pass ADD or + * DEL - confirm. */ + if (op == DP_NETDEV_FLOW_OFFLOAD_OP_ADD) { + desc = "add"; + ret = netdev_conns_add(netdevs, conns); + conntrack_stats_add(ct, CT_DPIF_CONN_STATE_OFFLOAD_ADD_PROCESSED, n); + } else if (op == DP_NETDEV_FLOW_OFFLOAD_OP_DEL) { + struct conn *conn; + + desc = "delete"; + ret = netdev_conns_del(netdevs, conns); + conntrack_stats_add(ct, CT_DPIF_CONN_STATE_OFFLOAD_DEL_PROCESSED, n); + + /* Make sure that the offload provider left the batch intact, we + * must release all connection references. 
*/ + ovs_assert(n == batch_size(conns)); + + BATCH_FOREACH (conn, conns) { + conntrack_conn_unref(conn); + conntrack_conn_unref(conn); + } + } + + if (!pmd) { + ovs_rwlock_unlock(&dp->port_rwlock); + netdev_close(netdevs[CT_DIR_INIT]); + netdev_close(netdevs[CT_DIR_REP]); + } + + if (ret) { + VLOG_ERR_RL(&rl, "failed to %s %d connection offloads", desc, n); + } else { + /* Whatever is missing from the batch compared to its initial size + * 'n' is reported as failed. */ + int nb_ok = batch_size(conns); + int nb_fail = n - nb_ok; + + if (nb_fail > 0) { + VLOG_ERR_RL(&rl, "failed to %s %d connection offloads", desc, nb_fail); + } + if (nb_ok > 0 && VLOG_IS_DBG_ENABLED()) { + VLOG_DBG_RL(&rl, "succeeded to %s %d connection offloads", desc, nb_ok); + } + } +} + +/* Does nothing when the e2e cache is enabled. Otherwise repeatedly splits + * 'conns' into sub-batches of connections that share the same initiator + * and reply ports, and offloads each sub-batch with + * dp_netdev_offload_conns(). 'conns' is empty on return. */ +static void +dp_netdev_offload_conns_add(struct conntrack *ct, struct batch *conns) +{ + struct dp_netdev *dp = conntrack_datapath(ct); + struct dp_netdev_pmd_thread *pmd; + + if (dp_netdev_e2e_cache_enabled) { + return; + } + + pmd = ovsthread_getspecific(dp->per_pmd_key); + + while (!batch_is_empty(conns)) { + struct conn *first = batch_first(conns); + odp_port_t ports[CT_DIR_NUM]; + struct batch same_ports; + struct conn *conn; + + batch_init(&same_ports); + conntrack_offload_get_ports(first, ports); + BATCH_FOREACH_POP (conn, conns) { + odp_port_t it_ports[CT_DIR_NUM]; + + /* All connection handles in 'conns' must share the exact same ports, + * in the same order. 
*/ + conntrack_offload_get_ports(conn, it_ports); + if (it_ports[CT_DIR_INIT] == ports[CT_DIR_INIT] && + it_ports[CT_DIR_REP] == ports[CT_DIR_REP]) { + batch_add(&same_ports, conn); + } else { + /* Different ports: put it back for a later pass. */ + batch_add(conns, conn); + } + } + dp_netdev_offload_conns(pmd, ct, &same_ports, DP_NETDEV_FLOW_OFFLOAD_OP_ADD); + } +} + +/* Does nothing when the e2e cache is enabled. If the insertion thread + * recorded for 'conn' is not the calling offload thread, enqueues a + * DP_OFFLOAD_CONN delete message to that thread. Otherwise removes the + * offload directly with dp_netdev_offload_conns(). */ +static void +dp_netdev_offload_conn_del(struct conntrack *ct, struct conn *conn) +{ + struct ct_offload_handle *coh; + unsigned int insertion_tid; + + if (dp_netdev_e2e_cache_enabled) { + return; + } + + coh = conntrack_offload_get(conn); + insertion_tid = coh->insertion_tid; + + if (insertion_tid != netdev_offload_thread_id()) { + struct dp_netdev *dp = conntrack_datapath(ct); + struct dp_offload_thread_item *msg; + + msg = xzalloc(sizeof *msg + sizeof(struct dp_offload_conn_item)); + msg->timestamp = time_usec(); + msg->type = DP_OFFLOAD_CONN; + msg->dp = dp; + msg->data->conn.op = DP_NETDEV_FLOW_OFFLOAD_OP_DEL; + msg->data->conn.ct = ct; + msg->data->conn.conn = conn; + dp_netdev_offload_thread_enqueue(&dp_offload_threads[insertion_tid], msg); + } else { + struct dp_netdev *dp = conntrack_datapath(ct); + struct dp_netdev_pmd_thread *pmd = ovsthread_getspecific(dp->per_pmd_key); + struct batch conns = batch_init_one(conn); + + dp_netdev_offload_conns(pmd, ct, &conns, DP_NETDEV_FLOW_OFFLOAD_OP_DEL); + } +} + +/* Returns 0 if the statistics returned by netdev_conn_stats() for 'conn' + * show use later than 'prev_now', the error of netdev_conn_stats() if the + * query fails, and EINVAL otherwise, including when conntrack offload is + * disabled. */ +static int +dp_netdev_offload_conn_active(struct conntrack *ct, struct conn *conn, + long long now, long long prev_now) +{ + struct netdev *netdevs[CT_DIR_NUM]; + odp_port_t ports[CT_DIR_NUM]; + struct dpif_flow_stats stats; + const struct dp_netdev *dp; + int ret = 0; + + if (!conntrack_offload_is_enabled()) { + return EINVAL; + } + + dp = conntrack_datapath(ct); + conntrack_offload_get_ports(conn, ports); + for (int i = CT_DIR_INIT; i < CT_DIR_NUM; i++) { + netdevs[i] = netdev_ports_get(ports[i], dpif_normalize_type(dp->class->type)); + } + ret = netdev_conn_stats(netdevs, conn, &stats, NULL, now); + netdev_close(netdevs[CT_DIR_INIT]); + 
netdev_close(netdevs[CT_DIR_REP]); + + if (ret) { + return ret; + } + + return stats.used > prev_now ? 0 : EINVAL; +} + +/* Fills both halves of 'ufid' with a hash of the address of 'ufid' itself, + * then applies uuid_set_bits_v4() to it. */ +static void +dp_netdev_ct_offload_get_ufid(ovs_u128 *ufid) +{ + ufid->u64.hi = ufid->u64.lo = hash_pointer(ufid, 0); + uuid_set_bits_v4((struct uuid *) ufid, UUID_ATTR_1); +} + +/* Queues a CT flow put for 'offload' in the e2e cache and returns the + * result of e2e_cache_flow_put(). */ +static int +dp_netdev_ct_e2e_add_cb(struct ct_flow_offload_item *offload, + struct ct_match *match, struct nlattr *actions, + int actions_len) +{ + return e2e_cache_flow_put(true, &offload->ufid, match, actions, + actions_len, offload->timestamp); +} + +static void +dp_netdev_ct_offload_e2e_add(struct ct_flow_offload_item *offload); + +/* Conntrack offload callbacks installed by dp_netdev_ct_offload_init(). */ +static struct conntrack_offload_class dpif_ct_offload_class = { + .conn_get_ufid = dp_netdev_ct_offload_get_ufid, + .conns_add = dp_netdev_offload_conns_add, + .conn_del = dp_netdev_offload_conn_del, + .conn_active = dp_netdev_offload_conn_active, + .conn_e2e_add = dp_netdev_ct_offload_e2e_add, + .conn_e2e_del = dp_netdev_ct_offload_e2e_del, + .conn_e2e_active = dp_netdev_ct_offload_e2e_active, +}; + +/* Creates the conntrack instance of 'dp' and sets 'dpif_ct_offload_class' + * as its offload class. */ +void +dp_netdev_ct_offload_init(struct dp_netdev *dp) +{ + dp->conntrack = conntrack_init(dp); + conntrack_set_offload_class(dp->conntrack, &dpif_ct_offload_class); +} + +/* Clears the offload class of the conntrack instance of 'dp', then + * destroys that instance. */ +void +dp_netdev_ct_offload_uninit(struct dp_netdev *dp) +{ + /* Set offload_class to NULL to mark dp is being destroyed. 
*/ + conntrack_set_offload_class(dp->conntrack, NULL); + conntrack_destroy(dp->conntrack); +} + +/* Returns the number of threads visited by + * DP_NETDEV_OFFLOAD_FOREACH_THREAD. */ +unsigned int +dp_netdev_offload_thread_nb(void) +{ + struct dp_offload_thread *t; + unsigned int count = 0; + + DP_NETDEV_OFFLOAD_FOREACH_THREAD (t) { + count++; + } + return count; +} + +/* Sets every counter of 'e2e_stats' to zero. Some counters are atomic + * counts, the others are plain fields. */ +static void +dp_netdev_e2e_offload_init(struct e2e_cache_stats *e2e_stats) +{ + atomic_count_init(&e2e_stats->generated_trcs, 0); + e2e_stats->processed_trcs = 0; + atomic_count_init(&e2e_stats->discarded_trcs, 0); + atomic_count_init(&e2e_stats->aborted_trcs, 0); + atomic_count_init(&e2e_stats->throttled_trcs, 0); + atomic_count_init(&e2e_stats->queue_trcs, 0); + atomic_count_init(&e2e_stats->overflow_trcs, 0); + atomic_count_init(&e2e_stats->flow_add_msgs, 0); + atomic_count_init(&e2e_stats->flow_del_msgs, 0); + e2e_stats->flush_flow_msgs = 0; + e2e_stats->succ_merged_flows = 0; + e2e_stats->merge_rej_flows = 0; + e2e_stats->add_merged_flow_hw = 0; + e2e_stats->del_merged_flow_hw = 0; + e2e_stats->add_ct_mt_flow_hw = 0; + e2e_stats->del_ct_mt_flow_hw = 0; + e2e_stats->add_ct_mt_flow_err = 0; + e2e_stats->del_ct_mt_flow_err = 0; + e2e_stats->succ_ct2ct_merges = 0; + e2e_stats->rej_ct2ct_merges = 0; + e2e_stats->add_ct2ct_flows = 0; + e2e_stats->del_ct2ct_flows = 0; +} + +/* Resets the e2e statistics, the connection and enqueue counters and the + * moving averages of 'thread'. */ +static void +dp_netdev_offload_thread_reset(struct dp_offload_thread *thread) +{ + dp_netdev_e2e_offload_init(&thread->e2e_stats); + atomic_init(&thread->ct_uni_dir_connections, 0); + atomic_init(&thread->ct_bi_dir_connections, 0); + atomic_init(&thread->enqueued_offload, 0); + atomic_init(&thread->enqueued_ct_add, 0); + mov_avg_cma_init(&thread->cma); + mov_avg_ema_init(&thread->ema, 100); +} + +/* Marks an active 'thread' inactive and resets its counters. Does nothing + * if 'thread' is already inactive. */ +void +dp_netdev_offload_thread_uninit(struct dp_offload_thread *thread) +{ + bool active; + + atomic_read(&thread->active, &active); + if (!active) { + return; + } + + atomic_store_relaxed(&thread->active, false); + dp_netdev_offload_thread_reset(thread); +} + +/* Initializes the queues, maps and histograms of 'thread', resets its + * counters and marks it active. */ +void +dp_netdev_offload_thread_init(struct dp_offload_thread 
*thread) +{ + mpsc_queue_init(&thread->offload_queue); + mpsc_queue_init(&thread->ufid_queue); + mpsc_queue_init(&thread->trace_queue); + cmap_init(&thread->megaflow_to_mark); + cmap_init(&thread->mark_to_flow); + histogram_walls_set_log(&thread->latency, 1, 2000); + + for (int i = 0; i < DP_OFFLOAD_TYPE_NUM; i++) { + struct dp_offload_queue_metrics *m; + + m = &thread->queue_metrics[i]; + histogram_walls_set_log(&m->wait_time, 1, 2000); + histogram_walls_set_log(&m->service_time, 1, 10000); + histogram_walls_set_log(&m->sojourn_time, 1, 2000); + } + + dp_netdev_offload_thread_reset(thread); + atomic_store_relaxed(&thread->active, true); +} + +int +dpif_netdev2_offload_stats_get(struct dpif *dpif, + struct netdev_custom_stats *stats, + bool verbose) +{ + enum { + DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED_OFFLOADS, + DP_NETDEV_HW_OFFLOADS_STATS_INSERTED, + DP_NETDEV_HW_OFFLOADS_STATS_CT_UNI_DIR_CONNS, + DP_NETDEV_HW_OFFLOADS_STATS_CT_BI_DIR_CONNS, + DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN, + DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV, + DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MAX, + DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MIN, + DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN, + DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV, + DP_NETDEV_HW_OFFLOADS_STATS_LAST, + }; + enum { + DP_NETDEV_E2E_STATS_GENERATED_TRCS, + DP_NETDEV_E2E_STATS_PROCESSED_TRCS, + DP_NETDEV_E2E_STATS_DISCARRDED_TRCS, + DP_NETDEV_E2E_STATS_ABORTED_TRCS, + DP_NETDEV_E2E_STATS_THROTTLED_TRCS, + DP_NETDEV_E2E_STATS_QUEUE_TRCS, + DP_NETDEV_E2E_STATS_OVERFLOW_TRCS, + DP_NETDEV_E2E_STATS_FLOW_ADDS, + DP_NETDEV_E2E_STATS_FLOW_DELS, + DP_NETDEV_E2E_STATS_FLOW_FLUSHS, + DP_NETDEV_E2E_STATS_SUC_MERGES, + DP_NETDEV_E2E_STATS_REJ_MERGES, + DP_NETDEV_E2E_STATS_HW_ADD_E2E_FLOWS, + DP_NETDEV_E2E_STATS_HW_DEL_E2E_FLOWS, + DP_NETDEV_E2E_STATS_MERGED_FLOWS, + DP_NETDEV_E2E_STATS_DB_FLOWS, + DP_NETDEV_E2E_STATS_CT_MT_ADDS, + DP_NETDEV_E2E_STATS_CT_MT_DELS, + DP_NETDEV_E2E_STATS_FAILED_CT_MT_ADDS, + DP_NETDEV_E2E_STATS_FAILED_CT_MT_DELS, 
+ DP_NETDEV_E2E_STATS_SUC_CT2CT_MERGES, + DP_NETDEV_E2E_STATS_REJ_CT2CT_MERGES, + DP_NETDEV_E2E_STATS_CT2CT_ADDS, + DP_NETDEV_E2E_STATS_CT2CT_DELS, + DP_NETDEV_E2E_STATS_LAST, + }; + struct { + const char *name; + uint64_t total; + } hwol_stats[] = { + [DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED_OFFLOADS] = + { " Enqueued offloads", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] = + { " Inserted offloads", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_CT_UNI_DIR_CONNS] = + { " CT uni-dir Connections", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_CT_BI_DIR_CONNS] = + { " CT bi-dir Connections", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] = + { " Cumulative Average latency (us)", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] = + { " Cumulative Latency stddev (us)", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MAX] = + { " Cumulative Latency max (us)", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MIN] = + { " Cumulative Latency min (us)", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] = + { " Exponential Average latency (us)", 0 }, + [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] = + { " Exponential Latency stddev (us)", 0 }, + }, e2e_stats[] = { + [DP_NETDEV_E2E_STATS_GENERATED_TRCS] = + { " Generated traces", 0 }, + [DP_NETDEV_E2E_STATS_PROCESSED_TRCS] = + { " Processed traces", 0 }, + [DP_NETDEV_E2E_STATS_DISCARRDED_TRCS] = + { " Discarded traces", 0 }, + [DP_NETDEV_E2E_STATS_ABORTED_TRCS] = + { " Aborted traces", 0 }, + [DP_NETDEV_E2E_STATS_THROTTLED_TRCS] = + { " Throttled traces", 0 }, + [DP_NETDEV_E2E_STATS_QUEUE_TRCS] = + { " Queue traces", 0 }, + [DP_NETDEV_E2E_STATS_OVERFLOW_TRCS] = + { " Overflow traces", 0 }, + [DP_NETDEV_E2E_STATS_FLOW_ADDS] = + { " Flow add messages", 0 }, + [DP_NETDEV_E2E_STATS_FLOW_DELS] = + { " Flow del messages", 0 }, + [DP_NETDEV_E2E_STATS_FLOW_FLUSHS] = + { " Flow flush messages", 0 }, + [DP_NETDEV_E2E_STATS_SUC_MERGES] = + { " Successful merges", 0 }, + [DP_NETDEV_E2E_STATS_REJ_MERGES] = + { " Rejected merges", 0 }, + 
[DP_NETDEV_E2E_STATS_HW_ADD_E2E_FLOWS] = + { " HW add e2e flows", 0 }, + [DP_NETDEV_E2E_STATS_HW_DEL_E2E_FLOWS] = + { " HW del e2e flows", 0 }, + [DP_NETDEV_E2E_STATS_MERGED_FLOWS] = + { " Merged e2e flows", 0 }, + [DP_NETDEV_E2E_STATS_DB_FLOWS] = + { " e2e DB flows", 0 }, + [DP_NETDEV_E2E_STATS_CT_MT_ADDS] = + { " CT MT Adds", 0 }, + [DP_NETDEV_E2E_STATS_CT_MT_DELS] = + { " CT MT Dels", 0 }, + [DP_NETDEV_E2E_STATS_FAILED_CT_MT_ADDS] = + { " Failed CT MT Adds", 0 }, + [DP_NETDEV_E2E_STATS_FAILED_CT_MT_DELS] = + { " Failed CT MT Dels", 0 }, + [DP_NETDEV_E2E_STATS_SUC_CT2CT_MERGES] = + { " Successful CT2CT mrgs", 0 }, + [DP_NETDEV_E2E_STATS_REJ_CT2CT_MERGES] = + { " Rejected CT2CT merges", 0 }, + [DP_NETDEV_E2E_STATS_CT2CT_ADDS] = + { " CT2CT Adds", 0 }, + [DP_NETDEV_E2E_STATS_CT2CT_DELS] = + { " CT2CT Dels", 0 }, + }, *cur_stats; + + struct netdev_offload_stats per_port_nos[MAX_OFFLOAD_THREAD_NB]; + struct netdev_offload_stats total_nos[MAX_OFFLOAD_THREAD_NB]; + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_offload_thread *thread; + struct dp_netdev_port *port; + unsigned int nb_thread; + unsigned int nb_counts; + unsigned int tid; + size_t i; +#define DP_NETDEV_STATS_TOTAL_COUNTS \ + (ARRAY_SIZE(hwol_stats) + ARRAY_SIZE(e2e_stats)) + + if (!netdev_is_flow_api_enabled()) { + return EINVAL; + } + + /* /!\ NOTE: + * It is safe-ish to read nb_thread here and allocate + * counters, before iterating on the active threads. + * This function executes in the main thread. A change in + * active offload threads could only result from a configuration + * change executing in the main thread. As such, it is impossible + * for this thread number to change between the two reads. + * + * This is only valid if this function executes in the main thread. + */ + nb_thread = dp_netdev_offload_thread_nb(); + + ovs_assert(nb_thread > 0); + /* nb_thread counters for the overall total as well. 
*/ + nb_counts = ARRAY_SIZE(hwol_stats); + if (netdev_is_e2e_cache_enabled() && verbose) { + nb_counts += ARRAY_SIZE(e2e_stats); + } + stats->size = (nb_thread + 1) * nb_counts; + stats->counters = xcalloc(stats->size, sizeof *stats->counters); + + memset(total_nos, 0, sizeof total_nos); + + dp_netdev_port_rdlock(dp); + HMAP_FOR_EACH (port, node, &dp->ports) { + memset(per_port_nos, 0, sizeof per_port_nos); + /* Do not abort on read error from a port, just report 0. */ + if (!netdev_offload_get_stats(port->netdev, per_port_nos)) { + for (i = 0; i < nb_thread; i++) { + netdev_offload_stats_add(&total_nos[i], per_port_nos[i]); + } + } + } + ovs_rwlock_unlock(&dp->port_rwlock); + + DP_NETDEV_OFFLOAD_FOREACH_THREAD (thread, tid) { + uint64_t counts[DP_NETDEV_STATS_TOTAL_COUNTS]; + uint64_t *e2e_counts = &counts[DP_NETDEV_HW_OFFLOADS_STATS_LAST]; + size_t idx = (tid + 1) * nb_counts; + struct e2e_cache_stats *cur_e2e_stats; + + memset(counts, 0, sizeof counts); + counts[DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] = + total_nos[tid].n_inserted; + + atomic_read_relaxed(&thread->enqueued_offload, + &counts[DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED_OFFLOADS]); + atomic_read_relaxed(&thread->ct_uni_dir_connections, + &counts[DP_NETDEV_HW_OFFLOADS_STATS_CT_UNI_DIR_CONNS]); + atomic_read_relaxed(&thread->ct_bi_dir_connections, + &counts[DP_NETDEV_HW_OFFLOADS_STATS_CT_BI_DIR_CONNS]); + counts[DP_NETDEV_HW_OFFLOADS_STATS_CT_BI_DIR_CONNS] += + total_nos[tid].n_conns / 2; + counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] = + mov_avg_cma(&thread->cma); + counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] = + mov_avg_cma_std_dev(&thread->cma); + counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MAX] = + mov_avg_cma_max(&thread->cma); + counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MIN] = + mov_avg_cma_min(&thread->cma); + counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] = + mov_avg_ema(&thread->ema); + counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] = + mov_avg_ema_std_dev(&thread->ema); + 
cur_e2e_stats = &thread->e2e_stats; + e2e_counts[DP_NETDEV_E2E_STATS_GENERATED_TRCS] = + atomic_count_get(&cur_e2e_stats->generated_trcs); + e2e_counts[DP_NETDEV_E2E_STATS_PROCESSED_TRCS] = + cur_e2e_stats->processed_trcs; + e2e_counts[DP_NETDEV_E2E_STATS_DISCARRDED_TRCS] = + atomic_count_get(&cur_e2e_stats->discarded_trcs); + e2e_counts[DP_NETDEV_E2E_STATS_ABORTED_TRCS] = + atomic_count_get(&cur_e2e_stats->aborted_trcs); + e2e_counts[DP_NETDEV_E2E_STATS_THROTTLED_TRCS] = + atomic_count_get(&cur_e2e_stats->throttled_trcs); + e2e_counts[DP_NETDEV_E2E_STATS_QUEUE_TRCS] = + atomic_count_get(&cur_e2e_stats->queue_trcs); + e2e_counts[DP_NETDEV_E2E_STATS_OVERFLOW_TRCS] = + atomic_count_get(&cur_e2e_stats->overflow_trcs); + e2e_counts[DP_NETDEV_E2E_STATS_FLOW_ADDS] = + atomic_count_get(&cur_e2e_stats->flow_add_msgs); + e2e_counts[DP_NETDEV_E2E_STATS_FLOW_DELS] = + atomic_count_get(&cur_e2e_stats->flow_del_msgs); + e2e_counts[DP_NETDEV_E2E_STATS_FLOW_FLUSHS] = + cur_e2e_stats->flush_flow_msgs; + e2e_counts[DP_NETDEV_E2E_STATS_SUC_MERGES] = + cur_e2e_stats->succ_merged_flows; + e2e_counts[DP_NETDEV_E2E_STATS_REJ_MERGES] = + cur_e2e_stats->merge_rej_flows; + e2e_counts[DP_NETDEV_E2E_STATS_HW_ADD_E2E_FLOWS] = + cur_e2e_stats->add_merged_flow_hw; + e2e_counts[DP_NETDEV_E2E_STATS_HW_DEL_E2E_FLOWS] = + cur_e2e_stats->del_merged_flow_hw; + e2e_counts[DP_NETDEV_E2E_STATS_MERGED_FLOWS] = + atomic_count_get(&merged_flows_map_count); + e2e_counts[DP_NETDEV_E2E_STATS_DB_FLOWS] = + atomic_count_get(&flows_map_count); + e2e_counts[DP_NETDEV_E2E_STATS_CT_MT_ADDS] = + cur_e2e_stats->add_ct_mt_flow_hw; + e2e_counts[DP_NETDEV_E2E_STATS_CT_MT_DELS] = + cur_e2e_stats->del_ct_mt_flow_hw; + e2e_counts[DP_NETDEV_E2E_STATS_FAILED_CT_MT_ADDS] = + cur_e2e_stats->add_ct_mt_flow_err; + e2e_counts[DP_NETDEV_E2E_STATS_FAILED_CT_MT_DELS] = + cur_e2e_stats->del_ct_mt_flow_err; + e2e_counts[DP_NETDEV_E2E_STATS_SUC_CT2CT_MERGES] = + cur_e2e_stats->succ_ct2ct_merges; + 
e2e_counts[DP_NETDEV_E2E_STATS_REJ_CT2CT_MERGES] = + cur_e2e_stats->rej_ct2ct_merges; + e2e_counts[DP_NETDEV_E2E_STATS_CT2CT_ADDS] = + cur_e2e_stats->add_ct2ct_flows; + e2e_counts[DP_NETDEV_E2E_STATS_CT2CT_DELS] = + cur_e2e_stats->del_ct2ct_flows; + + for (i = 0; i < nb_counts; i++) { + cur_stats = i < DP_NETDEV_HW_OFFLOADS_STATS_LAST + ? &hwol_stats[i] + : &e2e_stats[i - DP_NETDEV_HW_OFFLOADS_STATS_LAST]; + snprintf(stats->counters[idx + i].name, + sizeof(stats->counters[idx + i].name), + " [%3u] %s", tid, cur_stats->name); + stats->counters[idx + i].value = counts[i]; + cur_stats->total += counts[i]; + } + e2e_stats[DP_NETDEV_E2E_STATS_MERGED_FLOWS].total = + atomic_count_get(&merged_flows_map_count); + e2e_stats[DP_NETDEV_E2E_STATS_DB_FLOWS].total = + atomic_count_get(&flows_map_count); + } + + /* Do an average of the average for the aggregate. */ + hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN].total /= nb_thread; + hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV].total /= nb_thread; + hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MAX].total /= nb_thread; + hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MIN].total /= nb_thread; + hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN].total /= nb_thread; + hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV].total /= nb_thread; + + for (i = 0; i < nb_counts; i++) { + cur_stats = i < DP_NETDEV_HW_OFFLOADS_STATS_LAST + ? &hwol_stats[i] + : &e2e_stats[i - DP_NETDEV_HW_OFFLOADS_STATS_LAST]; + snprintf(stats->counters[i].name, sizeof(stats->counters[i].name), + " Total %s", cur_stats->name); + stats->counters[i].value = cur_stats->total; + } + + return 0; +} + +/* Equivalent to 'dpif_is_netdev2' but usable in the + * metrics context. 
*/ +static bool +metrics_dpif_is_netdev2(void *it) +{ + return dpif_is_netdev2(it); +} + +METRICS_COND(foreach_dpif, foreach_dpif_netdev, + metrics_dpif_is_netdev2); + +METRICS_COND(foreach_dpif_netdev, foreach_dpif_netdev_ext, + metrics_ext_enabled); + +static bool +dpif_netdev_offload_enabled(void *it OVS_UNUSED) +{ + return netdev_is_flow_api_enabled(); +} + +METRICS_COND(foreach_dpif_netdev, dpif_netdev_offload, + dpif_netdev_offload_enabled); + +struct hw_offload_it { + struct dp_netdev *dp; + struct dp_offload_thread *thread; + unsigned int tid; +}; + +/* Calls 'visitor' once per offload thread. For each call, labels[0] holds + * the thread id formatted as a decimal string and 'ctx->it' points to a + * 'struct hw_offload_it' describing that thread. */ +static void +do_foreach_hw_offload_threads(metrics_visitor_fn visitor, + struct metrics_visitor_context *ctx, + struct metrics_node *node, + struct metrics_label *labels, + size_t n OVS_UNUSED) +{ + struct dp_offload_thread *thread; + struct hw_offload_it it; + unsigned int tid; + char id[50]; + + it.dp = get_dp_netdev2(ctx->it); + labels[0].value = id; + DP_NETDEV_OFFLOAD_FOREACH_THREAD (thread, tid) { + snprintf(id, sizeof id, "%u", tid); + it.thread = thread; + it.tid = tid; + /* Hand the per-thread iterator to the visitor. */ + ctx->it = &it; + visitor(ctx, node); + } +} + +METRICS_COLLECTION(dpif_netdev_offload, foreach_hw_offload_threads, + do_foreach_hw_offload_threads, "thread_num"); +METRICS_COND(foreach_hw_offload_threads, foreach_hw_offload_threads_dbg, + metrics_dbg_enabled); + +enum { + HWOL_METRICS_ENQUEUED, + HWOL_METRICS_INSERTED, + HWOL_METRICS_CT_UNIDIR, + HWOL_METRICS_CT_BIDIR, + HWOL_METRICS_N_ENTRIES, +}; + +#define HWOL_METRICS_ENTRIES \ + [HWOL_METRICS_ENQUEUED] = METRICS_GAUGE(n_enqueued, \ + "Number of hardware offload requests waiting to be processed."), \ + [HWOL_METRICS_INSERTED] = METRICS_GAUGE(n_inserted, \ + "Number of hardware offload rules currently inserted."), \ + [HWOL_METRICS_CT_UNIDIR] = METRICS_GAUGE(n_ct_unidir, \ + "Number of uni-directional connections offloaded in hardware."), \ + [HWOL_METRICS_CT_BIDIR] = METRICS_GAUGE(n_ct_bidir, \ + "Number of bi-directional connections offloaded in hardware."), + +static void
+hw_offload_read_value(double *values, void *_it) +{ + struct netdev_offload_stats per_port_nos[MAX_OFFLOAD_THREAD_NB]; + struct netdev_offload_stats total_nos[MAX_OFFLOAD_THREAD_NB]; + struct hw_offload_it *it = _it; + unsigned int tid = it->tid; + struct dp_netdev *dp = it->dp; + struct dp_offload_thread *t = it->thread; + struct dp_netdev_port *port; + uint64_t count; + + atomic_read_relaxed(&t->enqueued_offload, &count); + values[HWOL_METRICS_ENQUEUED] = count; + + memset(total_nos, 0, sizeof total_nos); + dp_netdev_port_rdlock(dp); + HMAP_FOR_EACH (port, node, &dp->ports) { + memset(per_port_nos, 0, sizeof per_port_nos); + if (!netdev_offload_get_stats(port->netdev, per_port_nos)) { + netdev_offload_stats_add(&total_nos[tid], per_port_nos[tid]); + } + } + ovs_rwlock_unlock(&dp->port_rwlock); + + values[HWOL_METRICS_INSERTED] = total_nos[tid].n_inserted; + + atomic_read_relaxed(&t->ct_uni_dir_connections, &count); + values[HWOL_METRICS_CT_UNIDIR] = count; + + atomic_read_relaxed(&t->ct_bi_dir_connections, &count); + values[HWOL_METRICS_CT_BIDIR] = total_nos[tid].n_conns / 2 + count; +} + +METRICS_ENTRIES(foreach_hw_offload_threads_dbg, hw_offload_threads_dbg_entries, + "hw_offload", hw_offload_read_value, HWOL_METRICS_ENTRIES); + +static struct histogram * +hw_offload_latency_get(void *_it) +{ + struct hw_offload_it *it = _it; + + return &it->thread->latency; +} + +METRICS_HISTOGRAM(foreach_hw_offload_threads_dbg, hw_offload_latency, + "Latency in milliseconds between an offload request and its " + "completion.", hw_offload_latency_get); + +static void +do_foreach_hw_offload_types(metrics_visitor_fn visitor, + struct metrics_visitor_context *ctx, + struct metrics_node *node, + struct metrics_label *labels, + size_t n OVS_UNUSED) +{ + const char *hw_offload_type_names[] = { + [DP_OFFLOAD_FLOW] = "flow", + [DP_OFFLOAD_FLUSH] = "flush", + [DP_OFFLOAD_CONN] = "conn", + [DP_OFFLOAD_STATS_CLEAR] = "stats_clear", + }; + struct hw_offload_it *ctx_it = ctx->it; + + for 
(int i = 0; i < DP_OFFLOAD_TYPE_NUM; i++) { + labels[0].value = hw_offload_type_names[i]; + ctx->it = &ctx_it->thread->queue_metrics[i]; + visitor(ctx, node); + } +} + +/* Iterates on offload (thread x type). */ +METRICS_COLLECTION(foreach_hw_offload_threads_dbg, foreach_hw_offload_types, + do_foreach_hw_offload_types, "type"); + +static struct histogram * +hw_offload_queue_sojourn_time_get(void *_it) +{ + struct dp_offload_queue_metrics *m = _it; + + return &m->sojourn_time; +} + +METRICS_HISTOGRAM(foreach_hw_offload_types, hw_offload_queue_sojourn_time, + "Distribution of sojourn time for an offload request " + "in milliseconds", hw_offload_queue_sojourn_time_get); + +static struct histogram * +hw_offload_queue_wait_time_get(void *_it) +{ + struct dp_offload_queue_metrics *m = _it; + + return &m->wait_time; +} + +METRICS_HISTOGRAM(foreach_hw_offload_types, hw_offload_queue_wait_time, + "Distribution of wait time for an offload request " + "in milliseconds", hw_offload_queue_wait_time_get); + +static struct histogram * +hw_offload_queue_service_time_get(void *_it) +{ + struct dp_offload_queue_metrics *m = _it; + + return &m->service_time; +} + +METRICS_HISTOGRAM(foreach_hw_offload_types, hw_offload_queue_service_time, + "Distribution of service time for an offload request " + "in microseconds", hw_offload_queue_service_time_get); + +static void +datapath_hw_offload_read_value(double *values, void *_dp) +{ + double t_values[HWOL_METRICS_N_ENTRIES]; + struct dp_offload_thread *thread; + struct hw_offload_it it; + size_t i; + + for (i = 0; i < HWOL_METRICS_N_ENTRIES; i++) { + values[i] = 0.0; + } + + it.dp = get_dp_netdev2(_dp); + DP_NETDEV_OFFLOAD_FOREACH_THREAD (thread, it.tid) { + it.thread = thread; + hw_offload_read_value(t_values, &it); + for (i = 0; i < HWOL_METRICS_N_ENTRIES; i++) { + values[i] += t_values[i]; + } + } +} + +METRICS_ENTRIES(dpif_netdev_offload, datapath_hw_offload_entries, + "datapath_hw_offload", datapath_hw_offload_read_value, + 
HWOL_METRICS_ENTRIES); + +/* Enqueues a DP_OFFLOAD_STATS_CLEAR item, timestamped with time_usec(), to + * every thread visited by DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN. + * + * Returns EINVAL if the flow API is disabled, otherwise 0. */ +int +dpif_netdev2_offload_stats_clear(struct dpif *dpif) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_offload_thread *thread; + + if (!netdev_is_flow_api_enabled()) { + return EINVAL; + } + + DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN (thread) { + struct dp_offload_thread_item *item; + + item = xmalloc(sizeof *item); + item->type = DP_OFFLOAD_STATS_CLEAR; + item->dp = dp; + item->timestamp = time_usec(); + + dp_netdev_offload_thread_enqueue(thread, item); + } + + return 0; +} + +void +dpif_netdev2_set_static_config_ct_add_queue_size(const struct smap *other_config) +{ + if (conntrack_offload_is_enabled()) { + offload_ct_add_queue_size = + smap_get_uint(other_config, "hw-offload-ct-add-queue-size", + CT_ADD_DEFAULT_QUEUE_SIZE); + if (offload_ct_add_queue_size == 0) { + offload_ct_add_queue_size = CT_ADD_DEFAULT_QUEUE_SIZE; + VLOG_WARN("The size of hw-offload-ct-add-queue-size must be " + "greater than 0"); + } else if (conntrack_offload_size() < offload_ct_add_queue_size) { + offload_ct_add_queue_size = conntrack_offload_size(); + VLOG_INFO("Limiting hw-offload-ct-add-queue-size to the " + "conntrack offload size %u", + offload_ct_add_queue_size); + } + VLOG_INFO("hw-offload-ct-add-queue-size = %"PRIi32, + offload_ct_add_queue_size); + } else { + offload_ct_add_queue_size = 0; + } +} + +void +dpif_netdev_set_config_e2e_cache(const struct smap *other_config) +{ + dp_netdev_e2e_cache_enabled = netdev_is_e2e_cache_enabled(); + dp_netdev_e2e_cache_size = netdev_get_e2e_cache_size(); + if (dp_netdev_e2e_cache_enabled) { + static bool done = false; + int i_value = smap_get_int(other_config, "e2e-cache-trace-q-size", + E2E_CACHE_MAX_TRACE_Q_SIZE); + if (i_value < 0) { + i_value = 0; + } + if (!done || dp_netdev_e2e_cache_trace_q_size != (uint32_t) i_value) { + dp_netdev_e2e_cache_trace_q_size = (uint32_t) i_value; + if (dp_netdev_e2e_cache_trace_q_size) { + VLOG_INFO("E2E cache trace queue size %u", +
dp_netdev_e2e_cache_trace_q_size); + } else { + VLOG_INFO("E2E cache trace queue unlimited"); + } + done = true; + } + } +} + +static inline bool +e2e_cache_set_action_is_valid(struct nlattr *a) +{ + const struct nlattr *set_action = nl_attr_get(a); + const size_t set_len = nl_attr_get_size(a); + const struct nlattr *sa; + unsigned int sleft; + + NL_ATTR_FOR_EACH (sa, sleft, set_action, set_len) { + enum ovs_key_attr type = nl_attr_type(sa); + + if (!(type == OVS_KEY_ATTR_ETHERNET || + type == OVS_KEY_ATTR_IPV4 || + type == OVS_KEY_ATTR_IPV6 || + type == OVS_KEY_ATTR_TCP || + type == OVS_KEY_ATTR_UDP)) { + VLOG_DBG("Unsupported set action type %d", type); + /* TODO: add statistic counter */ + return false; + } + } + return true; +} + +static inline bool +e2e_cache_flows_are_valid(struct e2e_cache_ovs_flow **netdev_flows, + uint16_t num) +{ + struct e2e_cache_ovs_flow *flow; + const struct match *match; + unsigned int left; + struct nlattr *a; + uint16_t i; + + for (i = 0; i < num; i++) { + flow = netdev_flows[i]; + if (flow->offload_state != E2E_OL_STATE_FLOW) { + continue; + } + + match = &flow->match[0]; + /* validate match */ + if ((match->flow.ipv6_label & match->wc.masks.ipv6_label) || + (match->flow.nw_tos & match->wc.masks.nw_tos) || + (match->flow.tcp_flags & match->wc.masks.tcp_flags) || + (match->flow.igmp_group_ip4 & match->wc.masks.igmp_group_ip4)) { + /* TODO: add statistic counter */ + return false; + } + + /* validate actions */ + NL_ATTR_FOR_EACH (a, left, flow->actions, flow->actions_size) { + enum ovs_action_attr type = nl_attr_type(a); + if (type == OVS_ACTION_ATTR_USERSPACE || + type == OVS_ACTION_ATTR_HASH || + type == OVS_ACTION_ATTR_TRUNC || + type == OVS_ACTION_ATTR_PUSH_NSH || + type == OVS_ACTION_ATTR_POP_NSH || + type == OVS_ACTION_ATTR_CT_CLEAR || + type == OVS_ACTION_ATTR_CHECK_PKT_LEN || + type == OVS_ACTION_ATTR_SAMPLE || + ((type == OVS_ACTION_ATTR_OUTPUT || + type == OVS_ACTION_ATTR_CLONE) && + left > NLA_ALIGN(a->nla_len)) || + 
((type == OVS_ACTION_ATTR_SET || + type == OVS_ACTION_ATTR_SET_MASKED) && + !e2e_cache_set_action_is_valid(a))) { + /* TODO: add statistic counter */ + return false; + } + } + } + return true; +} + +#define e2e_save_set_attr(mfield, field, flag) \ + ovs_assert(key); \ + if (mask) { \ + if (!is_all_zeros(&mask->field, sizeof mask->field)) { \ + if (!is_all_ones(&mask->field, sizeof mask->field)) { \ + VLOG_DBG_RL(&rl, "HW partial mask is not supported"); \ + } \ + merged->flags |= flag; \ + merged->mfield.field = key->field; \ + } \ + } else if (!is_all_zeros(&key->field, sizeof key->field)) { \ + merged->flags |= flag; \ + merged->mfield.field = key->field; \ + } + +static inline void +e2e_cache_save_set_actions(struct e2e_cache_merged_set *merged, bool masked, + const struct nlattr *set_action, + const size_t set_len) +{ + const struct nlattr *sa; + unsigned int sleft; + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + + NL_ATTR_FOR_EACH (sa, sleft, set_action, set_len) { + if (nl_attr_type(sa) == OVS_KEY_ATTR_ETHERNET) { + const struct ovs_key_ethernet *key = nl_attr_get(sa); + const struct ovs_key_ethernet *mask = masked ? key + 1 : NULL; + + e2e_save_set_attr(eth, eth_src, E2E_SET_ETH_SRC); + e2e_save_set_attr(eth, eth_dst, E2E_SET_ETH_DST); + } else if (nl_attr_type(sa) == OVS_KEY_ATTR_IPV4) { + const struct ovs_key_ipv4 *key = nl_attr_get(sa); + const struct ovs_key_ipv4 *mask = masked ? key + 1 : NULL; + + e2e_save_set_attr(ipv4, ipv4_src, E2E_SET_IPV4_SRC); + e2e_save_set_attr(ipv4, ipv4_dst, E2E_SET_IPV4_DST); + e2e_save_set_attr(ipv4, ipv4_ttl, E2E_SET_IPV4_TTL); + } else if (nl_attr_type(sa) == OVS_KEY_ATTR_IPV6) { + const struct ovs_key_ipv6 *key = nl_attr_get(sa); + const struct ovs_key_ipv6 *mask = masked ? 
key + 1 : NULL; + + e2e_save_set_attr(ipv6, ipv6_src, E2E_SET_IPV6_SRC); + e2e_save_set_attr(ipv6, ipv6_dst, E2E_SET_IPV6_DST); + e2e_save_set_attr(ipv6, ipv6_hlimit, E2E_SET_IPV6_HLMT); + } else if (nl_attr_type(sa) == OVS_KEY_ATTR_TCP) { + const struct ovs_key_tcp *key = nl_attr_get(sa); + const struct ovs_key_tcp *mask = masked ? key + 1 : NULL; + + e2e_save_set_attr(tcp, tcp_src, E2E_SET_TCP_SRC); + e2e_save_set_attr(tcp, tcp_dst, E2E_SET_TCP_DST); + } else if (nl_attr_type(sa) == OVS_KEY_ATTR_UDP) { + const struct ovs_key_udp *key = nl_attr_get(sa); + const struct ovs_key_udp *mask = masked ? key + 1 : NULL; + + e2e_save_set_attr(udp, udp_src, E2E_SET_UDP_SRC); + e2e_save_set_attr(udp, udp_dst, E2E_SET_UDP_DST); + } + } +} + +#define e2e_construct_set_attr(mfield, field, flag) \ + if (merged->flags & flag) { \ + key->field = merged->mfield.field; \ + memset(&mask->field, 0xFF, sizeof mask->field); \ + } + +static inline void +e2e_cache_attach_merged_set_action(struct ofpbuf *buf, size_t tnl_offset, + struct e2e_cache_merged_set *merged) +{ + size_t offset; + struct ofpbuf tmpbuf; + + ofpbuf_init(&tmpbuf, 0); + offset = nl_msg_start_nested(&tmpbuf, OVS_ACTION_ATTR_SET_MASKED); + if (merged->flags & E2E_SET_ETH) { + struct ovs_key_ethernet *key = NULL; + struct ovs_key_ethernet *mask = NULL; + + key = nl_msg_put_unspec_zero(&tmpbuf, OVS_KEY_ATTR_ETHERNET, + 2 * sizeof *key); + mask = key + 1; + e2e_construct_set_attr(eth, eth_src, E2E_SET_ETH_SRC); + e2e_construct_set_attr(eth, eth_dst, E2E_SET_ETH_DST); + } + if (merged->flags & E2E_SET_IPV4) { + struct ovs_key_ipv4 *key = NULL; + struct ovs_key_ipv4 *mask = NULL; + + key = nl_msg_put_unspec_zero(&tmpbuf, OVS_KEY_ATTR_IPV4, + 2 * sizeof *key); + mask = key + 1; + e2e_construct_set_attr(ipv4, ipv4_src, E2E_SET_IPV4_SRC); + e2e_construct_set_attr(ipv4, ipv4_dst, E2E_SET_IPV4_DST); + e2e_construct_set_attr(ipv4, ipv4_ttl, E2E_SET_IPV4_TTL); + } + if (merged->flags & E2E_SET_IPV6) { + struct ovs_key_ipv6 *key = 
NULL; + struct ovs_key_ipv6 *mask = NULL; + + key = nl_msg_put_unspec_zero(&tmpbuf, OVS_KEY_ATTR_IPV6, + 2 * sizeof *key); + mask = key + 1; + e2e_construct_set_attr(ipv6, ipv6_src, E2E_SET_IPV6_SRC); + e2e_construct_set_attr(ipv6, ipv6_dst, E2E_SET_IPV6_DST); + e2e_construct_set_attr(ipv6, ipv6_hlimit, E2E_SET_IPV6_HLMT); + } + if (merged->flags & E2E_SET_TCP) { + struct ovs_key_tcp *key = NULL; + struct ovs_key_tcp *mask = NULL; + + key = nl_msg_put_unspec_zero(&tmpbuf, OVS_KEY_ATTR_TCP, + 2 * sizeof *key); + mask = key + 1; + e2e_construct_set_attr(tcp, tcp_src, E2E_SET_TCP_SRC); + e2e_construct_set_attr(tcp, tcp_dst, E2E_SET_TCP_DST); + } + if (merged->flags & E2E_SET_UDP) { + struct ovs_key_udp *key = NULL; + struct ovs_key_udp *mask = NULL; + + key = nl_msg_put_unspec_zero(&tmpbuf, OVS_KEY_ATTR_UDP, + 2 * sizeof *key); + mask = key + 1; + e2e_construct_set_attr(udp, udp_src, E2E_SET_UDP_SRC); + e2e_construct_set_attr(udp, udp_dst, E2E_SET_UDP_DST); + } + nl_msg_end_nested(&tmpbuf, offset); + /* insert the set action after tnl_pop in the buf */ + ofpbuf_insert(buf, tnl_offset, tmpbuf.data, tmpbuf.size); + /* The nested attribute was copied into 'buf'; release the scratch + * buffer so that its storage is not leaked. */ + ofpbuf_uninit(&tmpbuf); +} + +static void +e2e_cache_merge_actions(struct e2e_cache_ovs_flow **netdev_flows, + uint16_t num, struct ofpbuf *buf, + const struct nlattr **last_ct) +{ + uint16_t i = 0; + unsigned int left; + const struct nlattr *a; + uint16_t num_set = 0; + struct e2e_cache_merged_set merged_set; + size_t tnl_offset = 0; + size_t pad; + + memset(&merged_set, 0, sizeof merged_set); + for (i = 0; i < num; i++) { + if (i > 0 && netdev_flows[i]->offload_state != E2E_OL_STATE_FLOW && + netdev_flows[i - 1]->offload_state != E2E_OL_STATE_FLOW) { + continue; + } + NL_ATTR_FOR_EACH (a, left, netdev_flows[i]->actions, + netdev_flows[i]->actions_size) { + enum ovs_action_attr type = nl_attr_type(a); + + if (type == OVS_ACTION_ATTR_CT && last_ct) { + *last_ct = a; + } + if (type == OVS_ACTION_ATTR_CT || + type == OVS_ACTION_ATTR_RECIRC) { + continue; + } + if (type ==
OVS_ACTION_ATTR_SET || + type == OVS_ACTION_ATTR_SET_MASKED) { + const struct nlattr *set_action = nl_attr_get(a); + const size_t set_len = nl_attr_get_size(a); + bool masked = (type == OVS_ACTION_ATTR_SET_MASKED); + + e2e_cache_save_set_actions(&merged_set, masked, + set_action, set_len); + num_set++; + continue; + } + if (type == OVS_ACTION_ATTR_TUNNEL_POP) { + tnl_offset = buf->size + a->nla_len; + } + pad = PAD_SIZE(a->nla_len, NLMSG_ALIGNTO); + ofpbuf_put(buf, a, a->nla_len + pad); + } + } + if (num_set) { + e2e_cache_attach_merged_set_action(buf, tnl_offset, &merged_set); + } +} + +#define merge_flow_match(field, src, dst) \ + if (!is_all_zeros(&src->wc.masks.field, \ + sizeof src->wc.masks.field) && \ + is_all_zeros(&dst->mask.field, \ + sizeof dst->mask.field)) { \ + memcpy(&dst->spec.field, &src->flow.field, \ + sizeof src->flow.field); \ + memcpy(&dst->mask.field, &src->wc.masks.field, \ + sizeof src->wc.masks.field); \ + } + +static void +dp_netdev_fill_ct_match(struct match *match, const struct ct_match *ct_match); + +static void +e2e_cache_merge_match(struct e2e_cache_ovs_flow **netdev_flows, + uint16_t num, struct merged_match *merged_match) +{ + struct e2e_cache_ovs_flow *flow; + struct match match_on_stack; + const struct match *match; + uint16_t i = 0; + + memset(merged_match, 0, sizeof *merged_match); + + for (i = 0; i < num; i++) { + flow = netdev_flows[i]; + if (i > 0 && flow->offload_state != E2E_OL_STATE_FLOW && + netdev_flows[i - 1]->offload_state != E2E_OL_STATE_FLOW) { + continue; + } + /* parse match */ + if (flow->offload_state == E2E_OL_STATE_FLOW) { + match = &flow->match[0]; + } else { + dp_netdev_fill_ct_match(&match_on_stack, &flow->ct_match[0]); + match = &match_on_stack; + } + /* merge in_port */ + merge_flow_match(in_port, match, merged_match); + + /* merge tunnel outer */ + merge_flow_match(tunnel.ip_src, match, merged_match); + merge_flow_match(tunnel.ip_dst, match, merged_match); + merge_flow_match(tunnel.ipv6_src, match, 
merged_match); + merge_flow_match(tunnel.ipv6_dst, match, merged_match); + merge_flow_match(tunnel.tun_id, match, merged_match); + merge_flow_match(tunnel.tp_dst, match, merged_match); + + /* merge inner/non-tnl */ + merge_flow_match(dl_src, match, merged_match); + merge_flow_match(dl_dst, match, merged_match); + merge_flow_match(dl_type, match, merged_match); + merge_flow_match(nw_src, match, merged_match); + merge_flow_match(nw_dst, match, merged_match); + merge_flow_match(ipv6_src, match, merged_match); + merge_flow_match(ipv6_dst, match, merged_match); + merge_flow_match(nw_frag, match, merged_match); + merge_flow_match(nw_proto, match, merged_match); + merge_flow_match(tp_src, match, merged_match); + merge_flow_match(tp_dst, match, merged_match); + if (match->flow.vlans[0].tci) { + merge_flow_match(vlans[0].tci, match, merged_match); + } + merge_flow_match(ct_zone, match, merged_match); + } +} + +static int +e2e_cache_merge_flows(struct e2e_cache_ovs_flow **flows, + uint16_t num_flows, + struct e2e_cache_merged_flow *merged_flow, + struct ofpbuf *merged_actions) +{ + unsigned int tid = netdev_offload_thread_id(); + struct e2e_cache_stats *e2e_stats; + struct match match; + + e2e_stats = &dp_offload_threads[tid].e2e_stats; + if (!e2e_cache_flows_are_valid(flows, num_flows)) { + e2e_stats->merge_rej_flows++; + return -1; + } + e2e_cache_merge_match(flows, num_flows, &merged_flow->merged_match); + merged_flow->merged_match.mask.ct_zone = 0; + merged_match_to_match(&match, &merged_flow->merged_match); + dp_netdev_get_mega_ufid(&match, &merged_flow->ufid); + uuid_set_bits_v4((struct uuid *) &merged_flow->ufid, UUID_ATTR_3); + e2e_cache_merge_actions(flows, num_flows, merged_actions, NULL); + if (OVS_UNLIKELY(merged_actions->size < sizeof(struct nlattr))) { + e2e_stats->merge_rej_flows++; + return -1; + } + merged_flow->flow_mark = INVALID_FLOW_MARK; + e2e_stats->succ_merged_flows++; + return 0; +} + +static int +ct2ct_merge_flows(struct e2e_cache_ovs_flow **flows, 
+ uint16_t num_flows, + struct e2e_cache_merged_flow *merged_flow, + struct ofpbuf *merged_actions) +{ + unsigned int tid = netdev_offload_thread_id(); + const struct nlattr *last_ct = NULL; + struct e2e_cache_stats *e2e_stats; + struct match match; + + ovs_assert(num_flows > 4); + + e2e_stats = &dp_offload_threads[tid].e2e_stats; + + /* Trace is: + * 0 Flow1 + * 1 CT1 + * ... + * num_flows - 4 Flow(N-1) + * num_flows - 3 CTN + * num_flows - 2 CTN-peer + * num_flows - 1 FlowN + * + * Matches are merged from CT1 (in [1]) until CTN included (in + * [num_flows - 3]). + * Actions are merged from Flow1 (in [0]) until CTN included (in + * [num_flows - 3]). + * The mark should be of Flow(N-1) (in [num_flows - 4]). + */ + e2e_cache_merge_match(&flows[1], num_flows - 3, + &merged_flow->merged_match); + merged_match_to_match(&match, &merged_flow->merged_match); + dp_netdev_get_mega_ufid(&match, &merged_flow->ufid); + uuid_set_bits_v4((struct uuid *) &merged_flow->ufid, UUID_ATTR_4); + e2e_cache_merge_actions(flows, num_flows - 2, merged_actions, &last_ct); + if (!last_ct) { + /* No CT action in the merged range: reject like the other failure + * paths, releasing the actions already merged and counting the + * rejection. */ + ofpbuf_uninit(merged_actions); + e2e_stats->rej_ct2ct_merges++; + return -1; + } + ofpbuf_put(merged_actions, last_ct, last_ct->nla_len); + if (OVS_UNLIKELY(merged_actions->size < sizeof(struct nlattr))) { + ofpbuf_uninit(merged_actions); + e2e_stats->rej_ct2ct_merges++; + return -1; + } + /* Set the mark to the last flow in the CT2CT section.
*/ + merged_flow->flow_mark = + megaflow_to_mark_find(&flows[num_flows - 4]->ufid); + if (merged_flow->flow_mark == INVALID_FLOW_MARK) { + ofpbuf_uninit(merged_actions); + e2e_stats->rej_ct2ct_merges++; + return -1; + } + e2e_stats->succ_ct2ct_merges++; + return 0; +} + +static void +dp_netdev_fill_ct_match(struct match *match, const struct ct_match *ct_match) +{ + memset(match, 0, sizeof *match); + if (ct_match->key.dl_type == htons(ETH_TYPE_IP)) { + /* Fill in ipv4 5-tuples */ + match->flow.nw_src = ct_match->key.src.addr.ipv4; + match->flow.nw_dst = ct_match->key.dst.addr.ipv4; + match->wc.masks.nw_src = OVS_BE32_MAX; + match->wc.masks.nw_dst = OVS_BE32_MAX; + } else { + /* Fill in ipv6 5-tuples */ + memcpy(&match->flow.ipv6_src, + &ct_match->key.src.addr.ipv6, + sizeof match->flow.ipv6_src); + memcpy(&match->flow.ipv6_dst, + &ct_match->key.dst.addr.ipv6, + sizeof match->flow.ipv6_dst); + memset(&match->wc.masks.ipv6_src, 0xFF, + sizeof match->wc.masks.ipv6_src); + memset(&match->wc.masks.ipv6_dst, 0xFF, + sizeof match->wc.masks.ipv6_dst); + } + match->flow.dl_type = ct_match->key.dl_type; + match->flow.nw_proto = ct_match->key.nw_proto; + match->wc.masks.dl_type = OVS_BE16_MAX; + match->wc.masks.nw_proto = UINT8_MAX; + if (match->flow.nw_proto == IPPROTO_TCP) { + match->wc.masks.tcp_flags = htons(TCP_SYN | TCP_RST | TCP_FIN); + } + if (match->flow.nw_proto == IPPROTO_TCP || + match->flow.nw_proto == IPPROTO_UDP) { + match->flow.tp_src = ct_match->key.src.port; + match->flow.tp_dst = ct_match->key.dst.port; + match->wc.masks.tp_src = OVS_BE16_MAX; + match->wc.masks.tp_dst = OVS_BE16_MAX; + } + match->flow.ct_zone = ct_match->key.zone; + match->wc.masks.ct_zone = UINT16_MAX; + match->flow.in_port.odp_port = ct_match->odp_port; + match->wc.masks.in_port.odp_port = u32_to_odp(UINT32_MAX); +} + +static void +dp_netdev_set_ct_mark_labels_attr(struct ofpbuf *buf, + uint16_t attr, + void *offload_key, + size_t size) +{ + uint8_t *key, *mask; + + key = 
nl_msg_put_unspec_zero(buf, attr, 2 * size); + mask = key + size; + memcpy(key, offload_key, size); + memset(mask, 0xFF, size); +} + +static void +dp_netdev_create_ct_actions(struct ofpbuf *buf, + struct ct_flow_offload_item *offload) +{ + size_t offset; + char helper[] = "offl,st(0x ),id_key(0x )"; + char s[17]; + char *end; + + if (offload->nat.mod_flags) { + offset = nl_msg_start_nested(buf, OVS_ACTION_ATTR_SET_MASKED); + if (offload->ct_match.key.dl_type == htons(ETH_TYPE_IP)) { + struct ovs_key_ipv4 *ipv4_key = NULL, *ipv4_mask = NULL; + + if (offload->nat.mod_flags & NAT_ACTION_SRC || + offload->nat.mod_flags & NAT_ACTION_DST) { + ipv4_key = nl_msg_put_unspec_zero(buf, OVS_KEY_ATTR_IPV4, + 2 * sizeof *ipv4_key); + ipv4_mask = ipv4_key + 1; + } + if (offload->nat.mod_flags & NAT_ACTION_SRC) { + ipv4_key->ipv4_src = offload->nat.key.src.addr.ipv4; + ipv4_mask->ipv4_src = OVS_BE32_MAX; + } + if (offload->nat.mod_flags & NAT_ACTION_DST) { + ipv4_key->ipv4_dst = offload->nat.key.dst.addr.ipv4; + ipv4_mask->ipv4_dst = OVS_BE32_MAX; + } + } else { + struct ovs_key_ipv6 *ipv6_key = NULL, *ipv6_mask = NULL; + + if (offload->nat.mod_flags & NAT_ACTION_SRC || + offload->nat.mod_flags & NAT_ACTION_DST) { + ipv6_key = nl_msg_put_unspec_zero(buf, OVS_KEY_ATTR_IPV6, + 2 * sizeof *ipv6_key); + ipv6_mask = ipv6_key + 1; + } + if (offload->nat.mod_flags & NAT_ACTION_SRC) { + ipv6_key->ipv6_src = offload->nat.key.src.addr.ipv6; + memset(&ipv6_mask->ipv6_src, 0xFF, sizeof ipv6_mask->ipv6_src); + } + if (offload->nat.mod_flags & NAT_ACTION_DST) { + ipv6_key->ipv6_dst = offload->nat.key.dst.addr.ipv6; + memset(&ipv6_mask->ipv6_dst, 0xFF, sizeof ipv6_mask->ipv6_dst); + } + } + if (offload->nat.mod_flags & NAT_ACTION_SRC_PORT || + offload->nat.mod_flags & NAT_ACTION_DST_PORT) { + if (offload->ct_match.key.nw_proto == IPPROTO_TCP) { + struct ovs_key_tcp *tcp_key, *tcp_mask; + + tcp_key = nl_msg_put_unspec_zero(buf, OVS_KEY_ATTR_TCP, + 2 * sizeof *tcp_key); + tcp_mask = tcp_key + 1; + 
if (offload->nat.mod_flags & NAT_ACTION_SRC_PORT) { + tcp_key->tcp_src = offload->nat.key.src.port; + tcp_mask->tcp_src = OVS_BE16_MAX; + } + if (offload->nat.mod_flags & NAT_ACTION_DST_PORT) { + tcp_key->tcp_dst = offload->nat.key.dst.port; + tcp_mask->tcp_dst = OVS_BE16_MAX; + } + } + if (offload->ct_match.key.nw_proto == IPPROTO_UDP) { + struct ovs_key_udp *udp_key, *udp_mask; + + udp_key = nl_msg_put_unspec_zero(buf, OVS_KEY_ATTR_UDP, + 2 * sizeof *udp_key); + udp_mask = udp_key + 1; + if (offload->nat.mod_flags & NAT_ACTION_SRC_PORT) { + udp_key->udp_src = offload->nat.key.src.port; + udp_mask->udp_src = OVS_BE16_MAX; + } + if (offload->nat.mod_flags & NAT_ACTION_DST_PORT) { + udp_key->udp_dst = offload->nat.key.dst.port; + udp_mask->udp_dst = OVS_BE16_MAX; + } + } + } + nl_msg_end_nested(buf, offset); + } + offset = nl_msg_start_nested(buf, OVS_ACTION_ATTR_CT); + dp_netdev_set_ct_mark_labels_attr(buf, OVS_CT_ATTR_MARK, + &offload->mark_key, sizeof(uint32_t)); + dp_netdev_set_ct_mark_labels_attr(buf, OVS_CT_ATTR_LABELS, + &offload->label_key, sizeof(ovs_u128)); + nl_msg_put_u16(buf, OVS_CT_ATTR_ZONE, offload->ct_match.key.zone); + + end = helper; + ovs_strcat(helper, sizeof helper, &end, "offl,st(0x"); + ovs_strcat(helper, sizeof helper, &end, u32_to_hex(s, offload->ct_state)); + ovs_strcat(helper, sizeof helper, &end, "),id_key(0x"); + ovs_strcat(helper, sizeof helper, &end, uintptr_to_hex(s, + offload->ctid_key)); + ovs_strcat(helper, sizeof helper, &end, ")"); + + nl_msg_put_string(buf, OVS_CT_ATTR_HELPER, helper); + nl_msg_end_nested(buf, offset); +} + +static int +dp_netdev_ct_offload_add_cb(struct ct_flow_offload_item *ct_offload, + struct ct_match *ct_match, struct nlattr *actions, + int actions_len) +{ + struct dp_netdev *dp = ct_offload->dp; + const char *dpif_type_str = dpif_normalize_type(dp->class->type); + struct offload_info info = { .flow_mark = INVALID_FLOW_MARK, }; + struct netdev *port; + struct match match; + int ret; + + port = 
netdev_ports_get(ct_match->odp_port, dpif_type_str); + if (OVS_UNLIKELY(!port)) { + return ENODEV; + } + + dp_netdev_fill_ct_match(&match, ct_match); + + dp_netdev_port_rdlock_limit(dp, 10); + if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) { + struct ds ds = DS_EMPTY_INITIALIZER; + struct ofpbuf key_buf, mask_buf; + struct odp_flow_key_parms odp_parms = { + .flow = &match.flow, + .mask = &match.wc.masks, + .support = dp_netdev_support, + }; + + ofpbuf_init(&key_buf, 0); + ofpbuf_init(&mask_buf, 0); + + odp_flow_key_from_flow(&odp_parms, &key_buf); + odp_parms.key_buf = &key_buf; + odp_flow_key_from_mask(&odp_parms, &mask_buf); + + ds_put_cstr(&ds, "ct_add: "); + odp_format_ufid(&ct_offload->ufid, &ds); + ds_put_cstr(&ds, " "); + odp_flow_format(key_buf.data, key_buf.size, + mask_buf.data, mask_buf.size, + NULL, &ds, false); + ds_put_cstr(&ds, ", actions:"); + format_odp_actions(&ds, actions, actions_len, NULL); + + VLOG_DBG("%s", ds_cstr(&ds)); + + ofpbuf_uninit(&key_buf); + ofpbuf_uninit(&mask_buf); + + ds_destroy(&ds); + } + info.is_ct_conn = true; + info.orig_in_port = ct_match->orig_in_port; + ret = netdev_flow_put(port, &match, actions, actions_len, &ct_offload->ufid, + &info, NULL); + ovs_rwlock_unlock(&dp->port_rwlock); + netdev_close(port); + + return ret; +} + +typedef int +(*dp_netdev_ct_add_cb)(struct ct_flow_offload_item *ct_offload, + struct ct_match *match, struct nlattr *actions, + int actions_len); + +static int +dp_netdev_ct_add(struct ct_flow_offload_item *ct_offload, + dp_netdev_ct_add_cb cb) +{ + struct nlattr *actions; + size_t actions_size; + struct ofpbuf buf; + int ret; + + /* Bypass actions building if the work is already done. + * + * When e2e is enabled, the datapath will create the ct_actions and + * send them ready to the e2e thread. There, if the e2e-cache is not + * yet full, they will be consumed directly. Otherwise, an offload + * request will be emitted to the regular offload threads. 
+ * + * In this case, those OFL-threads will call again this function, + * but the actions will already have been created. + */ + + if (!ct_offload->ct_actions_set) { + ofpbuf_init(&buf, 0); + dp_netdev_create_ct_actions(&buf, ct_offload); + actions = ofpbuf_at_assert(&buf, 0, sizeof(struct nlattr)); + actions_size = buf.size; + } else { + actions = ct_offload->actions; + actions_size = ct_offload->actions_size; + } + + ret = cb(ct_offload, &ct_offload->ct_match, actions, actions_size); + + if (!ct_offload->ct_actions_set) { + ofpbuf_uninit(&buf); + } + + return ret; +} + +static void +dp_offload_conn(struct dp_offload_thread_item *msg) +{ + struct dp_offload_conn_item *doci = &msg->data->conn; + struct dp_netdev *dp = conntrack_datapath(doci->ct); + struct dp_netdev_pmd_thread *pmd = ovsthread_getspecific(dp->per_pmd_key); + struct batch conns = batch_init_one(doci->conn); + + dp_netdev_offload_conns(pmd, doci->ct, &conns, doci->op); +} + +void +dp_netdev_port_rdlock_at(struct dp_netdev *dp, unsigned long long int limit_ms, + const char *where) + OVS_ACQ_RDLOCK(dp->port_rwlock) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + unsigned long long int start = time_msec(); + + if (ovs_rwlock_tryrdlock(&dp->port_rwlock)) { + const char *holder = dp->port_rwlock.where; + unsigned long long int elapsed; + + ovs_rwlock_rdlock(&dp->port_rwlock); + elapsed = time_msec() - start; + if (elapsed > limit_ms) { + VLOG_WARN_RL(&rl, "%s: Unreasonably long %llums port_rwlock wait, " + "held from %s", where, elapsed, holder); + } + } +} + +static void +pmd_thread_offload_netdevs(struct dp_netdev_pmd_thread *pmd, odp_port_t port_nos[CT_DIR_NUM], + struct netdev *netdevs[CT_DIR_NUM]) +{ + for (int i = 0; i < pmd->rx_port_count; i++) { + struct dp_netdev_port *port = pmd->rx_port_cache[i]; + + for (int dir = 0; dir < CT_DIR_NUM; dir++) { + if (!netdevs[dir]) { + if (port->port_no == port_nos[dir] && + !port->offload_disabled) { + netdevs[dir] = port->netdev; + } + 
} + } + } +} + +void +pmd_thread_offload_disable(struct dp_netdev_pmd_thread *pmd, struct netdev *netdev) +{ + for (int i = 0; i < pmd->rx_port_count; i++) { + struct dp_netdev_port *port = pmd->rx_port_cache[i]; + + if (port->netdev == netdev) { + port->offload_disabled = true; + return; + } + } +} + +#define DP_PMD_OFFLOAD_UPKEEP_PERIOD_MS (64) +#define DP_OFFLOAD_UPKEEP_PERIOD_MS (256) +/* Number of max-backoff to roughly reach the upkeep period. */ +#define DP_OFFLOAD_UPKEEP_N_BACKOFF \ + (DP_OFFLOAD_UPKEEP_PERIOD_MS / DP_NETDEV_OFFLOAD_BACKOFF_MAX) +BUILD_ASSERT_DECL(IS_POW2(DP_OFFLOAD_UPKEEP_N_BACKOFF)); + +void +pmd_thread_offload_upkeep(struct dp_netdev_pmd_thread *pmd, bool force) +{ + if (!force && pmd->offload_next_upkeep > pmd->ctx.now) { + return; + } + + dp_netdev_pmd_idle_end(pmd); + for (int i = 0; i < pmd->rx_port_count; i++) { + netdev_offload_upkeep(pmd->rx_port_cache[i]->netdev, true); + } + pmd->offload_next_upkeep = pmd->ctx.now + (DP_PMD_OFFLOAD_UPKEEP_PERIOD_MS * 1000); +} + +/* Eschew safety analysis as the PMD thread implicitly took ownership + * of the offload thread by allocating its netdev-offload thread ID. + * This allocation cannot be expressed in a way safety analysis understands. 
+ */ +unsigned int +pmd_thread_offload_process(struct dp_netdev_pmd_thread *pmd, unsigned int limit) + OVS_REQUIRES(dp_offload_threads[pmd->offload_thread_id].offload_queue.read_lock) +{ + struct dp_offload_thread *offload_thread; + struct dp_offload_thread_item *offload; + struct mpsc_queue_node *node; + unsigned int n_msgs = 0; + + if (pmd->offload_thread_id == OVSTHREAD_ID_UNSET) { + return 0; + } + + offload_thread = &dp_offload_threads[pmd->offload_thread_id]; + MPSC_QUEUE_FOR_EACH_POP (node, &dp_offload_threads[pmd->offload_thread_id].offload_queue) { + dp_netdev_pmd_idle_end(pmd); + n_msgs++; + atomic_count_dec64(&offload_thread->enqueued_offload); + offload = CONTAINER_OF(node, struct dp_offload_thread_item, node); + dp_offload_process(offload_thread, offload); + /* Using limit == 0 means that every message still in the queue + * must be processed. It is used when terminating the polling thread + * to clean up requests. */ + if (limit != 0 && n_msgs >= limit) { + break; + } + } + + pmd_thread_offload_upkeep(pmd, false); + + return n_msgs; +} + +void +pmd_thread_offload_init(struct dp_netdev_pmd_thread *pmd) + OVS_ACQUIRES(dp_offload_threads[pmd->offload_thread_id].offload_queue.read_lock) +{ + pmd->offload_thread_id = netdev_offload_thread_id(); + if (pmd->core_id == NON_PMD_CORE_ID) { + ovs_assert(pmd->offload_thread_id == NETDEV_OFFLOAD_THREAD_MAIN); + } + dp_netdev_offload_thread_init(&dp_offload_threads[pmd->offload_thread_id]); + mpsc_queue_acquire(&dp_offload_threads[pmd->offload_thread_id].offload_queue); +} + +void +pmd_thread_offload_uninit(struct dp_netdev_pmd_thread *pmd) + OVS_RELEASES(dp_offload_threads[pmd->offload_thread_id].offload_queue.read_lock) +{ + pmd_thread_offload_process(pmd, 0); + mpsc_queue_release(&dp_offload_threads[pmd->offload_thread_id].offload_queue); + + dp_netdev_offload_thread_uninit(&dp_offload_threads[pmd->offload_thread_id]); + pmd->offload_thread_id = OVSTHREAD_ID_UNSET; + netdev_offload_thread_uninit(); +} + +void 
+dp_netdev_pmd_idle_begin(struct dp_netdev_pmd_thread *pmd) +{ + if (pmd->core_id != NON_PMD_CORE_ID && + !pmd->idle) { + ovsrcu_quiesce_start(); + pmd->idle = true; + } +} + +void +dp_netdev_pmd_idle_end(struct dp_netdev_pmd_thread *pmd) +{ + if (pmd->idle) { + ovsrcu_quiesce_end(); + pmd->idle = false; + pmd->next_rcu_quiesce = + pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; + } +} + +static int +dp_netdev_ct_offload_e2e_active(struct ct_flow_offload_item *offload, + long long now, long long prev_now) +{ + struct dpif_flow_stats stats; + const struct dp_netdev *dp; + struct netdev *netdev; + int ret; + + if (!conntrack_offload_is_enabled()) { + return EINVAL; + } + + dp = offload->dp; + netdev = netdev_ports_get(offload->ct_match.odp_port, + dpif_normalize_type(dp->class->type)); + if (!netdev) { + return ENODEV; + } + + memset(&stats, 0, sizeof stats); + ret = !e2e_cache_get_merged_flows_stats(netdev, NULL, NULL, &offload->ufid, &stats, NULL, + now, prev_now); + + netdev_close(netdev); + if (ret) { + return ret; + } + + return stats.used > prev_now ? 
0 : EINVAL; +} + +static void +dp_netdev_ct_offload_e2e_add(struct ct_flow_offload_item *offload) +{ + dp_netdev_ct_add(offload, dp_netdev_ct_e2e_add_cb); +} + +void +dp_netdev_flow_format(const char *prefix, + struct ds *s, + const struct dp_netdev_flow *dp_flow) +{ + struct dp_netdev_actions *dp_actions; + + ds_init(s); + ds_put_format(s, "%s: ", prefix); + odp_format_ufid(&dp_flow->ufid, s); + ds_put_cstr(s, " mega_"); + odp_format_ufid(&dp_flow->mega_ufid, s); + ds_put_cstr(s, " "); + + flow_format(s, &dp_flow->flow, NULL); + + dp_actions = dp_netdev_flow_get_actions(dp_flow); + ds_put_cstr(s, ", actions:"); + if (dp_actions) { + format_odp_actions(s, dp_actions->actions, dp_actions->size, NULL); + } else { + ds_put_cstr(s, "(nil)"); + } +} + +long long int +e2e_cache_flow_db_handle_ufid_msg(struct e2e_cache_ufid_msg *ufid_msg) +{ + long long int enqueue_time_us = 0; + + if (ufid_msg->op == E2E_UFID_MSG_PUT) { + e2e_cache_flow_db_put(ufid_msg); + } else if (ufid_msg->op == E2E_UFID_MSG_DEL) { + e2e_cache_flow_db_del(ufid_msg); + } else { + OVS_NOT_REACHED(); + } + + enqueue_time_us = ufid_msg->timestamp; + e2e_cache_ufid_msg_free(ufid_msg); + return enqueue_time_us; +} + +void +dp_netdev_offload_poll_queues(struct dp_offload_thread *ofl_thread, + struct e2e_cache_ufid_msg **ufid_msg, + struct dp_offload_thread_item **offload_item, + struct e2e_cache_trace_message **trace_msg) + OVS_REQUIRES(ofl_thread->ufid_queue.read_lock, + ofl_thread->offload_queue.read_lock, + ofl_thread->trace_queue.read_lock) +{ + struct mpsc_queue_node *queue_node; + unsigned int n_backoff; + uint64_t backoff; + + *ufid_msg = NULL; + *offload_item = NULL; + *trace_msg = NULL; + + backoff = DP_NETDEV_OFFLOAD_BACKOFF_MIN; + n_backoff = 0; + + while (1) { + queue_node = mpsc_queue_pop(&ofl_thread->ufid_queue); + if (queue_node != NULL) { + /* ufid message is high priority. if we have it we are done. 
*/ + *ufid_msg = CONTAINER_OF(queue_node, struct e2e_cache_ufid_msg, + node); + return; + } + + queue_node = mpsc_queue_pop(&ofl_thread->offload_queue); + if (queue_node != NULL) { + *offload_item = CONTAINER_OF(queue_node, + struct dp_offload_thread_item, node); + atomic_count_dec64(&ofl_thread->enqueued_offload); + return; + } + + queue_node = mpsc_queue_pop(&ofl_thread->trace_queue); + if (queue_node != NULL) { + *trace_msg = CONTAINER_OF(queue_node, + struct e2e_cache_trace_message, node); + atomic_count_dec(&ofl_thread->e2e_stats.queue_trcs); + return; + } + + /* Execute upkeep if + * + * + we are waiting for work for the first time + * -> We have just stopped a streak of offloading, + * some remaining things might need cleanup. + * + * + we have waited roughly the amount of time + * between upkeep period. + */ + if ((n_backoff & (DP_OFFLOAD_UPKEEP_N_BACKOFF - 1)) == 0) { + /* Signal 'quiescing' only on the first backoff. */ + netdev_ports_upkeep(n_backoff == 0); + } + n_backoff += 1; + + /* The thread is flagged as quiescent during xnanosleep(). 
*/ + xnanosleep(backoff * 1E6); + if (backoff < DP_NETDEV_OFFLOAD_BACKOFF_MAX) { + backoff <<= 1; + } + } +} + +long long int +dp_offload_measure_latency(struct dp_offload_thread *thread, + long long int enqueue_time_us, + long long int finish_time_us) +{ + long long int latency_us; + + latency_us = finish_time_us - enqueue_time_us; + mov_avg_cma_update(&thread->cma, latency_us); + mov_avg_ema_update(&thread->ema, latency_us); + histogram_add_sample(&thread->latency, latency_us / 1000); + + return latency_us; +} + +void +dp_offload_process(struct dp_offload_thread *thread, + struct dp_offload_thread_item *msg) +{ + struct dp_offload_queue_metrics *m; + long long int enqueue_time_us; + long long int dequeue_time_us; + long long int service_time_us; + long long int finish_time_us; + long long int wait_time_ms; + long long int latency_us; + + enqueue_time_us = msg->timestamp; + dequeue_time_us = time_usec(); + + switch (msg->type) { + case DP_OFFLOAD_FLOW: + dp_offload_flow(msg); + break; + case DP_OFFLOAD_CONN: + dp_offload_conn(msg); + break; + case DP_OFFLOAD_STATS_CLEAR: + mov_avg_cma_init(&thread->cma); + mov_avg_ema_init(&thread->ema, 100); + break; + case DP_OFFLOAD_FLUSH: + dp_offload_flush(msg); + break; + default: + OVS_NOT_REACHED(); + } + + finish_time_us = time_usec(); + latency_us = dp_offload_measure_latency(thread, + enqueue_time_us, + finish_time_us); + + wait_time_ms = (dequeue_time_us - enqueue_time_us) / 1000; + service_time_us = finish_time_us - dequeue_time_us; + m = &thread->queue_metrics[msg->type]; + histogram_add_sample(&m->wait_time, wait_time_ms); + histogram_add_sample(&m->service_time, service_time_us); + histogram_add_sample(&m->sojourn_time, latency_us / 1000); + + switch (msg->type) { + case DP_OFFLOAD_FLOW: + if (!thread->high_latency_event && + latency_us >= 200000) { + thread->high_latency_event = true; + COVERAGE_INC(flow_offload_200ms_latency); + } + break; + case DP_OFFLOAD_CONN: + if (!thread->high_latency_event) { + 
thread->high_latency_event = true; + if (latency_us >= 100) { + COVERAGE_INC(ct_offload_100us_latency); + } else if (latency_us >= 50) { + COVERAGE_INC(ct_offload_50us_latency); + } else if (latency_us > 30) { + COVERAGE_INC(ct_offload_30us_latency); + } else { + thread->high_latency_event = false; + } + } + break; + case DP_OFFLOAD_STATS_CLEAR: + /* Fallthrough */ + case DP_OFFLOAD_FLUSH: + /* Fallthrough */ + default: + break; + } + + dp_netdev_free_offload(msg); +} + +static void +poll_threads_n_read_value(double *values, void *it) +{ + struct dp_netdev *dp = get_dp_netdev2(it); + + values[0] = cmap_count(&dp->poll_threads); +} + +METRICS_ENTRIES(foreach_dpif_netdev, poll_threads_n, + "poll_threads", poll_threads_n_read_value, + METRICS_GAUGE(n, "Number of polling threads."), +); + +static void +do_foreach_poll_threads(metrics_visitor_fn visitor, + struct metrics_visitor_context *ctx, + struct metrics_node *node, + struct metrics_label *labels, + size_t n OVS_UNUSED) +{ + struct dp_netdev *dp = get_dp_netdev2(ctx->it); + struct dp_netdev_pmd_thread *pmd; + char core[50]; + char numa[50]; + + labels[0].value = core; + labels[1].value = numa; + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (pmd->core_id == NON_PMD_CORE_ID && + !metrics_dbg_enabled(NULL)) { + /* By definition, if the core ID is not one of a PMD, + * then it is not a poll thread (i.e. 'main'). + * Do not iterate on it as if it was one. 
*/ + continue; + } + snprintf(core, sizeof core, "%u", pmd->core_id); + snprintf(numa, sizeof numa, "%d", pmd->numa_id); + if (pmd->core_id == NON_PMD_CORE_ID) { + snprintf(core, sizeof core, "main"); + snprintf(numa, sizeof numa, "0"); + } + ctx->it = pmd; + visitor(ctx, node); + } +} + +METRICS_COLLECTION(foreach_dpif_netdev, foreach_poll_threads, + do_foreach_poll_threads, "core", "numa"); + +METRICS_COND(foreach_poll_threads, foreach_poll_threads_ext, + metrics_ext_enabled); + +METRICS_COND(foreach_poll_threads, foreach_poll_threads_dbg, + metrics_dbg_enabled); + +enum { + PMD_METRICS_PACKETS, + PMD_METRICS_RECIRC, + PMD_METRICS_HIT, + PMD_METRICS_MISSED, + PMD_METRICS_LOST, + PMD_METRICS_AVG_LOOKUPS_PER_HIT, + PMD_METRICS_AVG_PACKETS_PER_BATCH, + PMD_METRICS_AVG_RECIRC_PER_PACKET, + PMD_METRICS_AVG_PASSES_PER_PACKET, + PMD_METRICS_AVG_CYCLES_PER_PACKET, + PMD_METRICS_AVG_BUSY_CYCLES_PER_PACKET, + PMD_METRICS_PERCENT_BUSY_CYCLES, + PMD_METRICS_PERCENT_IDLE_CYCLES, +}; + +static void +poll_threads_read_value(double *values, void *it) +{ + struct dp_netdev_pmd_thread *pmd = it; + uint64_t total_cycles, total_packets; + uint64_t stats[PMD_N_STATS]; + double busy_cycles_per_pkt; + double packets_per_batch; + double avg_busy_cycles; + double avg_idle_cycles; + double lookups_per_hit; + double recirc_per_pkt; + double passes_per_pkt; + double cycles_per_pkt; + uint64_t n_hit; + + /* Do not use 'pmd_perf_read_counters'. Counters are supposed to + * always be increasing, while the pmd perf module is made + * for debugging purpose and offers a 'clear' operation. + * Read the counters exactly as they are. 
+ */ + for (int i = 0; i < PMD_N_STATS; i++) { + atomic_read_relaxed(&pmd->perf_stats.counters.n[i], &stats[i]); + } + + n_hit = 0; + n_hit += stats[PMD_STAT_PHWOL_HIT]; + n_hit += stats[PMD_STAT_SIMPLE_HIT]; + n_hit += stats[PMD_STAT_EXACT_HIT]; + n_hit += stats[PMD_STAT_SMC_HIT]; + n_hit += stats[PMD_STAT_MASKED_HIT]; + + total_cycles = stats[PMD_CYCLES_ITER_IDLE] + + stats[PMD_CYCLES_ITER_BUSY]; + total_packets = stats[PMD_STAT_RECV]; + + lookups_per_hit = 0; + if (stats[PMD_STAT_MASKED_HIT] > 0) { + lookups_per_hit = (double) stats[PMD_STAT_MASKED_LOOKUP] / + (double) stats[PMD_STAT_MASKED_HIT]; + } + + packets_per_batch = 0; + if (stats[PMD_STAT_SENT_BATCHES] > 0) { + packets_per_batch = (double) stats[PMD_STAT_SENT_PKTS] / + (double) stats[PMD_STAT_SENT_BATCHES]; + } + + avg_idle_cycles = 0; + avg_busy_cycles = 0; + if (total_cycles > 0) { + avg_idle_cycles = (double) stats[PMD_CYCLES_ITER_IDLE] / + (double) total_cycles * 100.0; + avg_busy_cycles = (double) stats[PMD_CYCLES_ITER_BUSY] / + (double) total_cycles * 100.0; + } + + recirc_per_pkt = 0; + passes_per_pkt = 0; + cycles_per_pkt = 0; + busy_cycles_per_pkt = 0; + if (total_packets > 0) { + recirc_per_pkt = (double) stats[PMD_STAT_RECIRC] / + (double) total_packets; + passes_per_pkt = (double) (total_packets + stats[PMD_STAT_RECIRC]) / + (double) total_packets; + cycles_per_pkt = (double) total_cycles / (double) total_packets; + busy_cycles_per_pkt = (double) stats[PMD_CYCLES_ITER_BUSY] / + (double) total_packets; + } + + values[PMD_METRICS_PACKETS] = stats[PMD_STAT_RECV]; + values[PMD_METRICS_RECIRC] = stats[PMD_STAT_RECIRC]; + values[PMD_METRICS_HIT] = n_hit; + values[PMD_METRICS_MISSED] = stats[PMD_STAT_MISS]; + values[PMD_METRICS_LOST] = stats[PMD_STAT_LOST]; + + values[PMD_METRICS_AVG_LOOKUPS_PER_HIT] = lookups_per_hit; + values[PMD_METRICS_AVG_PACKETS_PER_BATCH] = packets_per_batch; + values[PMD_METRICS_AVG_RECIRC_PER_PACKET] = recirc_per_pkt; + values[PMD_METRICS_AVG_PASSES_PER_PACKET] = 
passes_per_pkt; + values[PMD_METRICS_AVG_CYCLES_PER_PACKET] = cycles_per_pkt; + values[PMD_METRICS_AVG_BUSY_CYCLES_PER_PACKET] = busy_cycles_per_pkt; + values[PMD_METRICS_PERCENT_BUSY_CYCLES] = avg_busy_cycles; + values[PMD_METRICS_PERCENT_IDLE_CYCLES] = avg_idle_cycles; +} + +METRICS_ENTRIES(foreach_poll_threads, poll_threads_entries, + "poll_threads", poll_threads_read_value, + [PMD_METRICS_PACKETS] = METRICS_COUNTER(packets, + "Number of received packets."), + [PMD_METRICS_RECIRC] = METRICS_COUNTER(recirculations, + "Number of executed packet recirculations."), + [PMD_METRICS_HIT] = METRICS_COUNTER(hit, + "Number of flow table matches."), + [PMD_METRICS_MISSED] = METRICS_COUNTER(missed, + "Number of flow table misses and upcall succeeded."), + [PMD_METRICS_LOST] = METRICS_COUNTER(lost, + "Number of flow table misses and upcall failed."), + [PMD_METRICS_AVG_LOOKUPS_PER_HIT] = METRICS_GAUGE(lookups_per_hit, + "Average number of lookups per flow table hit."), + [PMD_METRICS_AVG_PACKETS_PER_BATCH] = METRICS_GAUGE(packets_per_batch, + "Average number of packets per batch."), + [PMD_METRICS_AVG_RECIRC_PER_PACKET] = METRICS_GAUGE(recirc_per_packet, + "Average number of recirculations per packet."), + [PMD_METRICS_AVG_PASSES_PER_PACKET] = METRICS_GAUGE(passes_per_packet, + "Average number of datapath passes per packet."), + [PMD_METRICS_AVG_CYCLES_PER_PACKET] = METRICS_GAUGE(cycles_per_packet, + "Average number of CPU cycles per packet."), + [PMD_METRICS_AVG_BUSY_CYCLES_PER_PACKET] = METRICS_GAUGE( + busy_cycles_per_packet, + "Average number of active CPU cycles per packet."), + [PMD_METRICS_PERCENT_BUSY_CYCLES] = METRICS_GAUGE(busy_cycles, + "Percent of useful CPU cycles."), + [PMD_METRICS_PERCENT_IDLE_CYCLES] = METRICS_GAUGE(idle_cycles, + "Percent of idle CPU cycles."), +); + +enum { + PMD_METRICS_SIMPLE_N_ENTRIES, + PMD_METRICS_SIMPLE_HIT, + PMD_METRICS_SIMPLE_MISS, + PMD_METRICS_SIMPLE_UPDATE, + PMD_METRICS_EMC_N_ENTRIES, + PMD_METRICS_EMC_HIT, + 
PMD_METRICS_EMC_MISS, + PMD_METRICS_EMC_UPDATE, + PMD_METRICS_SMC_N_ENTRIES, + PMD_METRICS_SMC_HIT, + PMD_METRICS_SMC_MISS, + PMD_METRICS_SMC_UPDATE, + PMD_METRICS_CLS_N_ENTRIES, + PMD_METRICS_CLS_HIT, + PMD_METRICS_CLS_MISS, + PMD_METRICS_CLS_UPDATE, + PMD_METRICS_N_CACHE_ENTRIES, +}; + +static unsigned int +dpcls_count(struct dpcls *cls) +{ + struct dpcls_subtable *subtable; + unsigned int count = 0; + + CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) { + count += cmap_count(&subtable->rules); + } + + return count; +} + +static void +poll_threads_cache_read_value(double *values, void *it) +{ + struct dp_netdev_pmd_thread *pmd = it; + uint64_t stats[PMD_N_STATS]; + unsigned int pmd_n_cls_rules; + struct dpcls *cls; + + for (int i = 0; i < PMD_N_STATS; i++) { + atomic_read_relaxed(&pmd->perf_stats.counters.n[i], &stats[i]); + } + + values[PMD_METRICS_SIMPLE_N_ENTRIES] = + cmap_count(&pmd->simple_match_table); + values[PMD_METRICS_SIMPLE_HIT] = stats[PMD_STAT_SIMPLE_HIT]; + values[PMD_METRICS_SIMPLE_MISS] = stats[PMD_STAT_SIMPLE_MISS]; + values[PMD_METRICS_SIMPLE_UPDATE] = stats[PMD_STAT_SIMPLE_UPDATE]; + + values[PMD_METRICS_EMC_N_ENTRIES] = + emc_cache_count(&(pmd->flow_cache).emc_cache); + values[PMD_METRICS_EMC_HIT] = stats[PMD_STAT_EXACT_HIT]; + values[PMD_METRICS_EMC_MISS] = stats[PMD_STAT_EXACT_MISS]; + values[PMD_METRICS_EMC_UPDATE] = stats[PMD_STAT_EXACT_UPDATE]; + + values[PMD_METRICS_SMC_N_ENTRIES] = + smc_cache_count(&(pmd->flow_cache).smc_cache); + values[PMD_METRICS_SMC_HIT] = stats[PMD_STAT_SMC_HIT]; + values[PMD_METRICS_SMC_MISS] = stats[PMD_STAT_SMC_MISS]; + values[PMD_METRICS_SMC_UPDATE] = stats[PMD_STAT_SMC_UPDATE]; + + pmd_n_cls_rules = 0; + CMAP_FOR_EACH (cls, node, &pmd->classifiers) { + pmd_n_cls_rules += dpcls_count(cls); + } + + values[PMD_METRICS_CLS_N_ENTRIES] = pmd_n_cls_rules; + values[PMD_METRICS_CLS_HIT] = stats[PMD_STAT_MASKED_HIT]; + values[PMD_METRICS_CLS_MISS] = stats[PMD_STAT_MASKED_LOOKUP] - + stats[PMD_STAT_MASKED_HIT]; 
+ values[PMD_METRICS_CLS_UPDATE] = stats[PMD_STAT_MASKED_UPDATE]; +} + +/* Use a single point of definition for the cache entries to enforce + * strict alignment between 'datapath_cache' and 'poll_threads_cache' + * metrics. */ +#define PMD_METRICS_CACHE_ENTRIES \ + /* Simple match cache. */ \ + [PMD_METRICS_SIMPLE_N_ENTRIES] = METRICS_GAUGE(simple_n_entries, \ + "Number of entries in the simple match cache."), \ + [PMD_METRICS_SIMPLE_HIT] = METRICS_COUNTER(simple_hit, \ + "Number of lookup hit in the simple match cache."), \ + [PMD_METRICS_SIMPLE_MISS] = METRICS_COUNTER(simple_miss, \ + "Number of lookup miss in the simple match cache."), \ + [PMD_METRICS_SIMPLE_UPDATE] = METRICS_COUNTER(simple_update, \ + "Number of updates of the simple match cache."), \ + /* Exact match cache. */ \ + [PMD_METRICS_EMC_N_ENTRIES] = METRICS_GAUGE(emc_n_entries, \ + "Number of entries in the exact match cache."), \ + [PMD_METRICS_EMC_HIT] = METRICS_COUNTER(emc_hit, \ + "Number of lookup hit in the exact match cache."), \ + [PMD_METRICS_EMC_MISS] = METRICS_COUNTER(emc_miss, \ + "Number of lookup miss in the exact match cache."), \ + [PMD_METRICS_EMC_UPDATE] = METRICS_COUNTER(emc_update, \ + "Number of updates of the exact match cache."), \ + /* Signature match cache. */ \ + [PMD_METRICS_SMC_N_ENTRIES] = METRICS_GAUGE(smc_n_entries, \ + "Number of entries in the signature match cache."), \ + [PMD_METRICS_SMC_HIT] = METRICS_COUNTER(smc_hit, \ + "Number of lookup hit in the signature match cache."), \ + [PMD_METRICS_SMC_MISS] = METRICS_COUNTER(smc_miss, \ + "Number of lookup miss in the signature match cache."), \ + [PMD_METRICS_SMC_UPDATE] = METRICS_COUNTER(smc_update, \ + "Number of updates of the signature match cache."), \ + /* Datapath classifiers. 
*/ \ + [PMD_METRICS_CLS_N_ENTRIES] = METRICS_GAUGE(cls_n_entries, \ + "Number of entries in the datapath classifiers."), \ + [PMD_METRICS_CLS_HIT] = METRICS_COUNTER(cls_hit, \ + "Number of lookup hit in the datapath classifiers."), \ + [PMD_METRICS_CLS_MISS] = METRICS_COUNTER(cls_miss, \ + "Number of lookup miss in the datapath classifiers."), \ + [PMD_METRICS_CLS_UPDATE] = METRICS_COUNTER(cls_update, \ + "Number of updates of the datapath classifiers."), + +METRICS_ENTRIES(foreach_poll_threads_dbg, poll_threads_cache_dbg_entries, + "poll_threads_cache", poll_threads_cache_read_value, PMD_METRICS_CACHE_ENTRIES); + +static void +datapath_cache_read_value(double *values, void *it) +{ + double pmd_values[PMD_METRICS_N_CACHE_ENTRIES]; + struct dp_netdev *dp = get_dp_netdev2(it); + struct dp_netdev_pmd_thread *pmd; + int i; + + for (i = 0; i < ARRAY_SIZE(pmd_values); i++) { + values[i] = 0.0; + } + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + poll_threads_cache_read_value(pmd_values, pmd); + for (i = 0; i < ARRAY_SIZE(pmd_values); i++) { + values[i] += pmd_values[i]; + } + } +} + +METRICS_ENTRIES(foreach_dpif_netdev_ext, datapath_cache_ext_entries, + "datapath_cache", datapath_cache_read_value, PMD_METRICS_CACHE_ENTRIES); + +METRICS_DECLARE(hw_offload_threads_dbg_entries); +METRICS_DECLARE(hw_offload_latency); +METRICS_DECLARE(hw_offload_queue_sojourn_time); +METRICS_DECLARE(hw_offload_queue_wait_time); +METRICS_DECLARE(hw_offload_queue_service_time); +METRICS_DECLARE(datapath_hw_offload_entries); + +void +dpif_netdev_metrics_register(void) +{ + METRICS_REGISTER(datapath_cache_ext_entries); + METRICS_REGISTER(poll_threads_entries); + METRICS_REGISTER(poll_threads_cache_dbg_entries); + METRICS_REGISTER(hw_offload_threads_dbg_entries); + METRICS_REGISTER(hw_offload_latency); + METRICS_REGISTER(hw_offload_queue_sojourn_time); + METRICS_REGISTER(hw_offload_queue_wait_time); + METRICS_REGISTER(hw_offload_queue_service_time); + 
METRICS_REGISTER(datapath_hw_offload_entries); +} + +static void +dpif_netdev2_set_static_config(const struct smap *other_config) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + + if (!ovsthread_once_start(&once)) { + return; + } + + conntrack_offload_config(other_config); + dpif_netdev2_set_static_config_ct_add_queue_size(other_config); + + ovsthread_once_done(&once); +} + +void +dpif_netdev2_set_ext_config(struct dp_netdev *dp, const struct smap *other_config, + bool first_set_config) +{ + bool pmd_quiet_idle, cur_pmd_quiet_idle; + + if (smap_get_node(other_config, "max-recirc-depth")) { + unsigned int read_depth; + + read_depth = smap_get_uint(other_config, "max-recirc-depth", + DEFAULT_MAX_RECIRC_DEPTH); + if (read_depth < DEFAULT_MAX_RECIRC_DEPTH) { + read_depth = DEFAULT_MAX_RECIRC_DEPTH; + } + if (netdev_is_e2e_cache_enabled() + && read_depth > E2E_CACHE_MAX_TRACE) { + VLOG_INFO("max recirc depth is %d if e2e-cache is enabled", + E2E_CACHE_MAX_TRACE); + read_depth = E2E_CACHE_MAX_TRACE; + } + if (max_recirc_depth != read_depth) { + max_recirc_depth = read_depth; + VLOG_INFO("max recirc depth set to %u", read_depth); + } + } + + bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config); + if (first_set_config || sleep_changed) { + log_all_pmd_sleeps(dp); + } + + pmd_quiet_idle = smap_get_bool(other_config, "pmd-quiet-idle", false); + atomic_read_relaxed(&dp->pmd_quiet_idle, &cur_pmd_quiet_idle); + if (first_set_config || pmd_quiet_idle != cur_pmd_quiet_idle) { + atomic_store_relaxed(&dp->pmd_quiet_idle, pmd_quiet_idle); + VLOG_INFO("PMD quiescent idling mode %s.", + pmd_quiet_idle ? 
"enabled" : "disabled"); + } + + dpif_netdev2_set_static_config(other_config); +} + +void +dp_netdev_doca_affinity_list(struct dp_netdev *dp, + struct netdev *netdev, + char **affinity_list) +{ + struct ds doca_affinity_s = DS_EMPTY_INITIALIZER; + int numa_id = netdev_get_numa_id(netdev); + struct ovs_numa_dump *pmd_cores; + struct ovs_numa_info_core *core; + bool same_numa; + uint8_t q; + int iter; + + pmd_cores = dp_netdev_pmd_cmask2cores(dp->pmd_cmask); + same_numa = true; + q = 0; + /* In the first iteration, assign the same numa cores, and the rest of + * the queues to other numa cores in the 2nd one. + */ + for (iter = 0; iter < 2; iter++) { + FOR_EACH_CORE_ON_DUMP(core, pmd_cores) { + if ((same_numa && numa_id != core->numa_id) || + (!same_numa && numa_id == core->numa_id)) { + continue; + } + if (q > 0) { + ds_put_cstr(&doca_affinity_s, ","); + } + ds_put_format(&doca_affinity_s, "%d:%d", q, core->core_id); + q++; + } + same_numa = false; + } + ovs_numa_dump_destroy(pmd_cores); + free(*affinity_list); + *affinity_list = ds_steal_cstr(&doca_affinity_s); + ds_destroy(&doca_affinity_s); +} + +int +dpif_netdev2_ct_get_stats(struct dpif *dpif, + struct ct_dpif_stats *stats) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + + return conntrack_get_stats(dp->conntrack, stats); +} + +void +dp_netdev_esw_ports_set_disabled(struct dp_netdev *dp, struct netdev *esw_mgr, bool value) + OVS_REQ_WRLOCK(dp->port_rwlock) +{ + struct dp_netdev_port *port; + int esw_mgr_pid; + + esw_mgr_pid = netdev_dpdk_get_esw_mgr_port_id(esw_mgr); + + if (esw_mgr_pid == -1) { + return; + } + + HMAP_FOR_EACH (port, node, &dp->ports) { + if (esw_mgr_pid == netdev_dpdk_get_esw_mgr_port_id(port->netdev)) { + port->disabled = value; + } + } +} + +int +dp_netdev_offload_netdev_meter_set(uint32_t meter_id_, + struct ofputil_meter_config *config) +{ + /* Compensate for ovs-ofctl (meter_ID - 1) adjustment */ + ofproto_meter_id id = { .uint32 = meter_id_ + 1 }; + return netdev_dpdk_meter_set(id, 
config); +} + +int +dp_netdev_offload_netdev_meter_get(uint32_t meter_id_, + struct ofputil_meter_stats *stats, + uint16_t n_bands) +{ + /* Compensate for ovs-ofctl (meter_ID - 1) adjustment */ + ofproto_meter_id id = { .uint32 = meter_id_ + 1 }; + return netdev_dpdk_meter_get(id, stats, n_bands); +} + +int +dp_netdev_offload_netdev_meter_del(uint32_t meter_id_, + struct ofputil_meter_stats *stats, + uint16_t n_bands) +{ + /* Compensate for ovs-ofctl (meter_ID - 1) adjustment */ + ofproto_meter_id id = { .uint32 = meter_id_ + 1 }; + return netdev_dpdk_meter_del(id, stats, n_bands); +} + +bool +dp_netdev_port_exists(struct netdev *netdev) +{ + struct ifreq ifr; + int sock; + int err; + + sock = socket(AF_INET, SOCK_STREAM, IPPROTO_IP); + if (sock < 0) { + return false; + } + ovs_strlcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name); + + /* Try to get some property of the port, to check it exists. */ + err = ioctl(sock, SIOCGIFMTU, &ifr); + + close(sock); + + return err == 0; +} diff --git a/lib/dpif-netdev-ext2.h b/lib/dpif-netdev-ext2.h new file mode 100644 index 00000000000..6f12d846471 --- /dev/null +++ b/lib/dpif-netdev-ext2.h @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DPIF_NETDEV_EXT_H +#define DPIF_NETDEV_EXT_H 1 + +#include +#include +#include +#include "dpif.h" +#include "dp-packet.h" +#include "netdev-offload-ext.h" +#include "openvswitch/match.h" +#include "openvswitch/types.h" +#include "packets.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct unixctl_conn; +struct dp_offload_thread; +struct dp_offload_thread_item; +struct dp_packet_flow_map; + +extern bool dp_netdev_e2e_cache_enabled; +extern unsigned int max_recirc_depth; + +struct ovs_numa_dump *dp_netdev_pmd_cmask2cores(const char *pmd_cmask); +unsigned int dpif_netdev2_get_n_pmd_threads(void); +void dpif_netdev2_set_n_pmd_threads(const char *pmd_cmask); + +void +dp_netdev_dump_packets_toggle(struct unixctl_conn *conn, int argc, + const char *argv[], void *aux OVS_UNUSED); + +void +dp_netdev_read_dump_packets_enabled(bool *flag); + +/* This struct holds the e2e-cache statistic counters: + * generated_trcs = Number of trace messages generated/dispatched to E2E cache. + * processed_trcs = Number of trace messages processed by E2E cache. + * discarded_trcs = Number of trace messages discarded by E2E cache. + * aborted_trcs = Number of trace messages aborted by E2E cache. + * throttled_trcs = Number of trace messages throttled due to high message + * rate. + * queue_trcs = Number of trace messages in E2E cache queue. + * overflow_trcs = Number of trace messages dropped due to + * queue overflow. + * flow_add_msgs = Number of new flow messages received by E2E cache. + * flow_del_msgs = Number of delete flow messages received by E2E cache. + * flush_flow_msgs = Number of flush flow messages received by E2E cache. + * succ_merged_flows = Number of successfully merged flows. + * merge_rej_flows = Number of flows rejected by the merge engine. + * add_merged_flow_hw = Number of add merged flow messages dispatched to + * HW offload. + * del_merged_flow_hw = Number of delete merged flow messages dispatched to + * HW offload.
+ * add_ct_mt_flow_hw = Number of successful CT offload operations to MT. + * add_ct_mt_flow_err = Number of failed CT offload operations to MT. + * succ_ct2ct_merges = Number of successful ct2ct merges. + * rej_ct2ct_merges = Number of merges rejected by the ct2ct merge engine. + * add_ct2ct_flows = Number of CT2CT offload add operations. + * del_ct2ct_flows = Number of CT2CT offload del operations. + */ +struct e2e_cache_stats { + atomic_count generated_trcs; + uint32_t processed_trcs; + atomic_count discarded_trcs; + atomic_count aborted_trcs; + atomic_count throttled_trcs; + atomic_count queue_trcs; + atomic_count overflow_trcs; + atomic_count flow_add_msgs; + atomic_count flow_del_msgs; + uint32_t flush_flow_msgs; + uint32_t succ_merged_flows; + uint32_t merge_rej_flows; + uint32_t add_merged_flow_hw; + uint32_t del_merged_flow_hw; + uint32_t add_ct_mt_flow_hw; + uint32_t del_ct_mt_flow_hw; + uint32_t add_ct_mt_flow_err; + uint32_t del_ct_mt_flow_err; + uint32_t succ_ct2ct_merges; + uint32_t rej_ct2ct_merges; + uint32_t add_ct2ct_flows; + uint32_t del_ct2ct_flows; +}; + +enum { + E2E_UFID_MSG_PUT = 1, + E2E_UFID_MSG_DEL = 2, +}; + +struct e2e_cache_ufid_msg { + struct mpsc_queue_node node; + ovs_u128 ufid; + int op; + bool is_ct; + struct nlattr *actions; + struct dp_netdev *dp; + struct conn *conn; + struct netdev *netdev; + struct ovs_barrier *barrier; + struct ovs_refcount *del_refcnt; + size_t actions_len; + long long int timestamp; + union { + struct match match[0]; + struct ct_match ct_match[0]; + }; +}; + +enum e2e_offload_state { + E2E_OL_STATE_FLOW, + E2E_OL_STATE_CT_SW, + E2E_OL_STATE_CT_HW, + E2E_OL_STATE_CT_MT, + E2E_OL_STATE_CT2CT, + E2E_OL_STATE_CT_ERR, + E2E_OL_STATE_NUM, +}; + +static const char * const e2e_offload_state_names[] = { + [E2E_OL_STATE_FLOW] = "E2E_OL_STATE_FLOW", + [E2E_OL_STATE_CT_SW] = "E2E_OL_STATE_CT_SW", + [E2E_OL_STATE_CT_HW] = "E2E_OL_STATE_CT_HW", + [E2E_OL_STATE_CT_MT] = "E2E_OL_STATE_CT_MT", + [E2E_OL_STATE_CT2CT] =
"E2E_OL_STATE_CT2CT", + [E2E_OL_STATE_CT_ERR] = "E2E_OL_STATE_CT_ERR", + [E2E_OL_STATE_NUM] = "Unknown", +}; + +struct merged_match_fields { + union flow_in_port in_port; /* Input port.*/ + struct { + ovs_be32 ip_dst; + struct in6_addr ipv6_dst; + ovs_be32 ip_src; + struct in6_addr ipv6_src; + ovs_be64 tun_id; + ovs_be16 tp_dst; + } tunnel; + + /* L2. */ + struct eth_addr dl_dst; /* Ethernet destination address. */ + struct eth_addr dl_src; /* Ethernet source address. */ + ovs_be16 dl_type; /* Ethernet frame type. */ + + /* VLANs. */ + union flow_vlan_hdr vlans[1]; /* VLANs */ + + /* L3. */ + ovs_be32 nw_src; /* IPv4 source address or ARP SPA. */ + ovs_be32 nw_dst; /* IPv4 destination address or ARP TPA. */ + struct in6_addr ipv6_src; /* IPv6 source address. */ + struct in6_addr ipv6_dst; /* IPv6 destination address. */ + uint8_t nw_frag; /* FLOW_FRAG_* flags. */ + uint8_t nw_proto; /* IP protocol or low 8 bits of ARP opcode. */ + + /* L4. */ + ovs_be16 tp_src; /* TCP/UDP/SCTP source port/ICMP type. */ + ovs_be16 tp_dst; /* TCP/UDP/SCTP destination port/ICMP code. */ + + /* MD. */ + uint16_t ct_zone; /* Connection tracking zone. */ +}; + +struct merged_match { + struct merged_match_fields spec; + struct merged_match_fields mask; +}; + +/* + * A mapping from ufid to flow for e2e cache. + */ +struct e2e_cache_ovs_flow { + struct hmap_node node; + ovs_u128 ufid; + unsigned int merge_tid; + struct nlattr *actions; + struct e2e_cache_ovs_flow *ct_peer; + enum e2e_offload_state offload_state; + uint16_t actions_size; + struct hmap merged_counters; /* Map of merged flows counters + it is part of. */ + struct ovs_list associated_merged_flows; + union { + struct match match[0]; + struct ct_match ct_match[0]; + }; +}; + +/* Helper struct for accessing a struct containing ovs_list array. 
+ * Containing struct + * |- Helper array + * [0] Helper item 0 + * |- ovs_list item 0 + * |- index (0) + * [1] Helper item 1 + * |- ovs_list item 1 + * |- index (1) + * To access the containing struct from one of the ovs_list items: + * 1. Get the helper item from the ovs_list item using + * helper item = + * CONTAINER_OF(ovs_list item, helper struct type, ovs_list field) + * 2. Get the containing struct from the helper item and its index in the array: + * containing struct = + * CONTAINER_OF(helper item, containing struct type, helper field[index]) + */ +struct flow2flow_item { + struct ovs_list list; + struct e2e_cache_ovs_flow *mt_flow; + uint16_t index; +}; + +enum { + E2E_SET_ETH_SRC = 1 << 0, + E2E_SET_ETH_DST = 1 << 1, + E2E_SET_ETH = E2E_SET_ETH_SRC | E2E_SET_ETH_DST, + + E2E_SET_IPV4_SRC = 1 << 2, + E2E_SET_IPV4_DST = 1 << 3, + E2E_SET_IPV4_TTL = 1 << 4, + E2E_SET_IPV4 = E2E_SET_IPV4_SRC | E2E_SET_IPV4_DST | \ + E2E_SET_IPV4_TTL, + + E2E_SET_IPV6_SRC = 1 << 5, + E2E_SET_IPV6_DST = 1 << 6, + E2E_SET_IPV6_HLMT = 1 << 7, + E2E_SET_IPV6 = E2E_SET_IPV6_SRC | E2E_SET_IPV6_DST | \ + E2E_SET_IPV6_HLMT, + + E2E_SET_UDP_SRC = 1 << 8, + E2E_SET_UDP_DST = 1 << 9, + E2E_SET_UDP = E2E_SET_UDP_SRC | E2E_SET_UDP_DST, + + E2E_SET_TCP_SRC = 1 << 10, + E2E_SET_TCP_DST = 1 << 11, + E2E_SET_TCP = E2E_SET_TCP_SRC | E2E_SET_TCP_DST, +}; + +struct e2e_cache_merged_set { + struct ovs_key_ethernet eth; + struct ovs_key_ipv4 ipv4; + struct ovs_key_ipv6 ipv6; + struct ovs_key_tcp tcp; + struct ovs_key_udp udp; + uint32_t flags; +}; + +/* + * Merged flow structure. + */ +struct e2e_cache_merged_flow { + union { + struct hmap_node in_hmap; + struct ovs_list in_list; + } node; + ovs_u128 ufid; + unsigned int tid; + struct dp_netdev *dp; + struct nlattr *actions; + uint16_t actions_size; + uint16_t associated_flows_len; + struct ovs_list flow_counter_list; /* Anchor for list of merged flows + using the same flow counter.
*/ + struct ovs_list ct_counter_list; /* Anchor for list of merged flows + using the same CT counter. */ + uintptr_t ct_counter_key; + struct flows_counter_key flows_counter_key; + struct merged_match merged_match; + uint32_t flow_mark; + struct flow2flow_item associated_flows[0]; +}; + +/* Counter object. */ +struct e2e_cache_counter_item { + struct hmap_node node; + struct ovs_list merged_flows; /* List of merged flows using this counter. */ + size_t hash; + bool is_ct; + struct flows_counter_key key; +}; + +int +dpif_netdev2_offload_stats_get(struct dpif *dpif, + struct netdev_custom_stats *stats, + bool verbose); + +int +dpif_netdev2_offload_stats_clear(struct dpif *dpif OVS_UNUSED); + +void +dpif_netdev2_set_static_config_ct_add_queue_size(const struct smap *other_config); + +void +dpif_netdev_set_config_e2e_cache(const struct smap *other_config); + +void +dp_netdev_offload_thread_uninit(struct dp_offload_thread *thread); + +void +dp_netdev_offload_thread_init(struct dp_offload_thread *thread); + +struct dp_offload_thread * +dp_netdev_offload_thread_next(struct dp_offload_thread *start, + unsigned int *tid, bool include_main); + +void +e2e_cache_trace_add_flow(struct dp_packet *p, + const ovs_u128 *ufid); + +int +e2e_cache_flow_del(const ovs_u128 *ufid, struct dp_netdev *dp, + struct conn *conn, long long int now); + +void +e2e_cache_trace_tnl_pop(struct dp_packet *packet); + +void +e2e_cache_dispatch_trace_message(struct dp_netdev *dp, + struct dp_packet_batch *batch, + long long int now); + +int +e2e_cache_flow_put(bool is_ct, const ovs_u128 *ufid, const void *match, + const struct nlattr *actions, size_t actions_len, + long long int now); + +int +e2e_cache_process_trace_info(struct dp_netdev *dp, + const struct e2e_cache_trace_info *trc_info, + unsigned int tid); + +bool +e2e_cache_get_merged_flows_stats(struct netdev *netdev, + struct match *match, + struct nlattr **actions, + const ovs_u128 *mt_ufid, + struct dpif_flow_stats *stats, + struct ofpbuf *buf, 
+ long long now, + long long prev_now); + +void +dpif_netdev_dump_e2e_flows(struct hmap *portno_names, + struct ofputil_port_map *port_map, struct ds *s); + +void +packet_enqueue_to_flow_map(struct dp_packet *packet, + struct dp_netdev_flow *flow, + uint16_t tcp_flags, + struct dp_packet_flow_map *flow_map, + size_t index); + +unsigned int +dp_netdev_offload_thread_nb(void); + +void +pmd_thread_offload_upkeep(struct dp_netdev_pmd_thread *pmd, bool force); + +unsigned int +pmd_thread_offload_process(struct dp_netdev_pmd_thread *pmd, unsigned int limit); + +void +pmd_thread_offload_init(struct dp_netdev_pmd_thread *pmd); + +void +pmd_thread_offload_uninit(struct dp_netdev_pmd_thread *pmd); + +void +pmd_thread_offload_disable(struct dp_netdev_pmd_thread *pmd, struct netdev *netdev); + +void +dp_netdev_pmd_idle_begin(struct dp_netdev_pmd_thread *pmd); + +void +dp_netdev_pmd_idle_end(struct dp_netdev_pmd_thread *pmd); + +void +dp_netdev_flow_format(const char *prefix, + struct ds *s, + const struct dp_netdev_flow *dp_flow); + +long long int +e2e_cache_flow_db_handle_ufid_msg(struct e2e_cache_ufid_msg *ufid_msg); + +long long int +dp_offload_measure_latency(struct dp_offload_thread *thread, + long long int enqueue_time_us, + long long int finish_time_us); + +void +dp_offload_process(struct dp_offload_thread *thread, + struct dp_offload_thread_item *msg); + +void +dp_netdev_offload_poll_queues(struct dp_offload_thread *ofl_thread, + struct e2e_cache_ufid_msg **ufid_msg, + struct dp_offload_thread_item **offload_item, + struct e2e_cache_trace_message **trace_msg); + +void +dp_netdev_ct_offload_init(struct dp_netdev *dp); + +void +dp_netdev_ct_offload_uninit(struct dp_netdev *dp); + +/* Iterator over all active offload threads. + * + * THREAD (struct dp_offload_thread *): + * Pointer to a thread object. Will be set to each valid + * thread in succession and then NULL after completion. 
+ */ +#define DP_NETDEV_OFFLOAD_FOREACH_THREAD_SHORT(INCLUDE_MAIN, THREAD) \ + for ((THREAD) = dp_netdev_offload_thread_next(NULL, NULL, INCLUDE_MAIN); (THREAD); \ + (THREAD) = dp_netdev_offload_thread_next((THREAD), NULL, INCLUDE_MAIN)) + +/* Iterator over all active offload threads. + * + * THREAD (struct dp_offload_thread *): + * Pointer to a thread object. Will be set to each valid + * thread in succession and then NULL after completion. + * + * ID (unsigned int): + * Variable that will contain the thread-id during each + * step of the iteration. + */ +#define DP_NETDEV_OFFLOAD_FOREACH_THREAD_LONG(INCLUDE_MAIN, THREAD, ID) \ + for ((THREAD) = dp_netdev_offload_thread_next(NULL, &(ID), INCLUDE_MAIN); (THREAD); \ + (THREAD) = dp_netdev_offload_thread_next((THREAD), &(ID), INCLUDE_MAIN)) + +/* Iterator over all active offload threads. + * + * This is a variadic macro that can take either SHORT or LONG + * form: passing only (THREAD), or (THREAD, ID), according to each + * variant signature above. + * + */ +#define DP_NETDEV_OFFLOAD_FOREACH_THREAD(...) \ + OVERLOAD_SAFE_MACRO(DP_NETDEV_OFFLOAD_FOREACH_THREAD_LONG, \ + DP_NETDEV_OFFLOAD_FOREACH_THREAD_SHORT, \ + 3, true, __VA_ARGS__) + +/* Iterator over all active offload threads except 'main'. + * + * This is a variadic macro that can take either SHORT or LONG + * form: passing only (THREAD), or (THREAD, ID), according to each + * variant signature above. + * + * The main thread is sometimes excluded because this special thread + * does not handle offload messages. If the iteration is done to emit + * messages, the main thread must be excluded. + * + */ +#define DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN(...)
\ + OVERLOAD_SAFE_MACRO(DP_NETDEV_OFFLOAD_FOREACH_THREAD_LONG, \ + DP_NETDEV_OFFLOAD_FOREACH_THREAD_SHORT, \ + 3, false, __VA_ARGS__) + +void +dpif_netdev_metrics_register(void); + +void +dp_netdev_offload_thread_enqueue(struct dp_offload_thread *thread, + struct dp_offload_thread_item *offload); + +void +dpif_netdev2_set_ext_config(struct dp_netdev *dp, const struct smap *other_config, + bool first_set_config); + +void +dp_netdev_doca_affinity_list(struct dp_netdev *dp, + struct netdev *netdev, + char **affinity_list); + +int +dpif_netdev2_ct_get_stats(struct dpif *dpif, + struct ct_dpif_stats *stats); + +int +dp_netdev_offload_netdev_meter_set(uint32_t meter_id_, + struct ofputil_meter_config *config); + +int +dp_netdev_offload_netdev_meter_get(uint32_t meter_id_, + struct ofputil_meter_stats *stats, + uint16_t n_bands); +int +dp_netdev_offload_netdev_meter_del(uint32_t meter_id_, + struct ofputil_meter_stats *stats, + uint16_t n_bands); + +bool +dp_netdev_port_exists(struct netdev *netdev); + +#ifdef __cplusplus +} +#endif + +#endif /* dpif-netdev-ext2.h */ diff --git a/lib/dpif-netdev2.c b/lib/dpif-netdev2.c new file mode 100644 index 00000000000..f39bb696621 --- /dev/null +++ b/lib/dpif-netdev2.c @@ -0,0 +1,10929 @@ +/* + * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc. + * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include "dpif-netdev2.h" +#include "dpif-netdev-private.h" +#include "dpif-netdev-private-dfc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitmap.h" +#include "ccmap.h" +#include "cmap.h" +#include "conntrack.h" +#include "conntrack-offload.h" +#include "conntrack-tp.h" +#include "coverage.h" +#include "ct-dpif.h" +#include "csum.h" +#include "dp-packet.h" +#include "dpif.h" +#include "dpif-netdev-lookup.h" +#include "dpif-netdev-perf.h" +#include "dpif-netdev-private-extract.h" +#include "dpif-plugin.h" +#include "dpif-provider.h" +#include "dummy.h" +#include "fat-rwlock.h" +#include "flow.h" +#include "histogram.h" +#include "hmapx.h" +#include "id-fpool.h" +#include "id-pool.h" +#include "ipf.h" +#include "metrics.h" +#include "mov-avg.h" +#include "mpsc-queue.h" +#include "netdev.h" +#include "netdev-offload.h" +#include "netdev-offload-dpdk-ext.h" +#include "netdev-provider.h" +#include "netdev-vport.h" +#include "netdev-dpdk.h" +#include "netlink.h" +#include "odp-execute.h" +#include "odp-util.h" +#include "openvswitch/dynamic-string.h" +#include "openvswitch/list.h" +#include "openvswitch/match.h" +#include "openvswitch/ofp-parse.h" +#include "openvswitch/ofp-print.h" +#include "openvswitch/ofpbuf.h" +#include "openvswitch/shash.h" +#include "openvswitch/vlog.h" +#include "ovs-doca.h" +#include "ovs-numa.h" +#include "ovs-rcu.h" +#include "packets.h" +#include "openvswitch/poll-loop.h" +#include "pvector.h" +#include "random.h" +#include "rtnetlink.h" +#include "seq.h" +#include "smap.h" +#include "sset.h" +#include "timeval.h" +#include "tnl-neigh-cache.h" +#include "tnl-ports.h" +#include "unixctl.h" +#include "util.h" +#include "uuid.h" + +VLOG_DEFINE_THIS_MODULE(dpif_netdev2); + +const struct dpif_class dpif_netdev2_class; + +/* Auto Load Balancing Defaults */ +#define ALB_IMPROVEMENT_THRESHOLD 25 +#define 
ALB_LOAD_THRESHOLD 95 +#define ALB_REBALANCE_INTERVAL 1 /* 1 Min */ +#define MAX_ALB_REBALANCE_INTERVAL 20000 /* 20000 Min */ +#define MIN_TO_MSEC 60000 + +#define FLOW_DUMP_MAX_BATCH 50 +/* Use per thread recirc_depth to prevent recirculation loop. */ +DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0) + +/* Use instant packet send by default. */ +#define DEFAULT_TX_FLUSH_INTERVAL 0 + +/* Configuration parameters. */ +enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */ + +COVERAGE_DEFINE(datapath_drop_meter); +COVERAGE_DEFINE(datapath_drop_upcall_error); +COVERAGE_DEFINE(datapath_drop_lock_error); +COVERAGE_DEFINE(datapath_drop_userspace_action_error); +COVERAGE_DEFINE(datapath_drop_tunnel_push_error); +COVERAGE_DEFINE(datapath_drop_tunnel_pop_error); +COVERAGE_DEFINE(datapath_drop_tunnel_tso_recirc); +COVERAGE_DEFINE(datapath_drop_recirc_error); +COVERAGE_DEFINE(datapath_drop_invalid_port); +COVERAGE_DEFINE(datapath_drop_invalid_bond); +COVERAGE_DEFINE(datapath_drop_invalid_tnl_port); +COVERAGE_DEFINE(datapath_drop_rx_invalid_packet); +#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */ +COVERAGE_DEFINE(datapath_drop_hw_miss_recover); +#endif + +/* Protects against changes to 'dp_netdevs'. */ +struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER; + +/* Contains all 'struct dp_netdev's. 
*/ +static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex) + = SHASH_INITIALIZER(&dp_netdevs); + +static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600); + +#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \ + | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \ + | CS_SRC_NAT | CS_DST_NAT) +#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK) + +struct odp_support dp_netdev_support = { + .max_vlan_headers = SIZE_MAX, + .max_mpls_depth = SIZE_MAX, + .recirc = true, + .ct_state = true, + .ct_zone = true, + .ct_mark = true, + .ct_label = true, + .ct_state_nat = true, + .ct_orig_tuple = true, + .ct_orig_tuple6 = true, +}; + + +/* Simple non-wildcarding single-priority classifier. */ + +/* Time in microseconds between successive optimizations of the dpcls + * subtable vector */ +#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL + +/* Time in microseconds of the interval in which rxq processing cycles used + * in rxq to pmd assignments is measured and stored. */ +#define PMD_INTERVAL_LEN 5000000LL +/* For converting PMD_INTERVAL_LEN to secs. */ +#define INTERVAL_USEC_TO_SEC 1000000LL + +/* Number of intervals for which cycles are stored + * and used during rxq to pmd assignment. */ +#define PMD_INTERVAL_MAX 12 + +/* Time in microseconds to try RCU quiescing. */ +#define PMD_RCU_QUIESCE_INTERVAL 10000LL + +/* Timer resolution for PMD threads in nanoseconds. */ +#define PMD_TIMER_RES_NS 1000 + +/* Number of pkts Rx on an interface that will stop pmd thread sleeping. */ +#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2) +/* Time in uS to increment a pmd thread sleep time. 
*/ +#define PMD_SLEEP_INC_US 1 + +struct pmd_sleep { + unsigned core_id; + uint64_t max_sleep; +}; + +static void dpcls_init(struct dpcls *); +static void dpcls_destroy(struct dpcls *); +static void dpcls_sort_subtable_vector(struct dpcls *); +static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls); +static void dpcls_insert(struct dpcls *, struct dpcls_rule *, + const struct netdev_flow_key *mask); +static void dpcls_remove(struct dpcls *, struct dpcls_rule *); + +/* Set of supported meter flags */ +#define DP_SUPPORTED_METER_FLAGS_MASK \ + (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST) + +/* Set of supported meter band types */ +#define DP_SUPPORTED_METER_BAND_TYPES \ + ( 1 << OFPMBT13_DROP ) + +static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp, + odp_port_t) + OVS_REQ_RDLOCK(dp->port_rwlock); + +enum rxq_cycles_counter_type { + RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and + processing packets during the current + interval. */ + RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used + during rxq to pmd assignment. */ + RXQ_N_CYCLES +}; + +struct dp_offload_thread dp_offload_threads[MAX_OFFLOAD_THREAD_NB]; + +static void *dp_netdev_flow_offload_main(void *arg); + +void +dp_netdev_offload_init(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + + if (!ovsthread_once_start(&once)) { + return; + } + + for (unsigned int tid = 0; tid < netdev_offload_thread_nb(); tid++) { + ovs_thread_create("hw_offload", dp_netdev_flow_offload_main, NULL); + } + + /* Wait until all requested offload threads have completed their initialization. */ + while (dp_netdev_offload_thread_nb() < netdev_offload_thread_nb() + 1) { + } + + ovsthread_once_done(&once); +} + +#define XPS_TIMEOUT 500000LL /* In microseconds. */ + +/* Contained by struct dp_netdev_port's 'rxqs' member. 
*/ +struct dp_netdev_rxq { + struct dp_netdev_port *port; + struct netdev_rxq *rx; + unsigned core_id; /* Core to which this queue should be + pinned. OVS_CORE_UNSPEC if the + queue doesn't need to be pinned to a + particular core. */ + atomic_count intrvl_idx; /* Write index for 'cycles_intrvl'. */ + struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */ + bool is_vhost; /* Is rxq of a vhost port. */ + + /* Counters of cycles spent successfully polling and processing pkts. */ + atomic_ullong cycles[RXQ_N_CYCLES]; + /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then + sum them to yield the cycles used for an rxq. */ + atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX]; +}; + +static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t, + struct flow *, bool); + +struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *, + size_t); +struct dp_netdev_actions *dp_netdev_flow_get_actions( + const struct dp_netdev_flow *); +static void dp_netdev_actions_free(struct dp_netdev_actions *); + +struct polled_queue { + struct dp_netdev_rxq *rxq; + odp_port_t port_no; + bool emc_enabled; + bool rxq_enabled; + uint64_t change_seq; +}; + +/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */ +struct rxq_poll { + struct dp_netdev_rxq *rxq; + struct hmap_node node; +}; + +/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache', + * 'tnl_port_cache' or 'tx_ports'. */ +struct tx_port { + struct dp_netdev_port *port; + int qid; + long long last_used; + struct hmap_node node; + long long flush_time; + struct dp_packet_batch output_pkts; + struct dp_packet_batch *txq_pkts; /* Only for hash mode. */ + struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST]; +}; + +/* Contained by struct tx_bond 'member_buckets'. */ +struct member_entry { + odp_port_t member_id; + atomic_ullong n_packets; + atomic_ullong n_bytes; +}; + +/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. 
*/ +struct tx_bond { + struct cmap_node node; + uint32_t bond_id; + struct member_entry member_buckets[BOND_BUCKETS]; +}; + +/* Interface to netdev-based datapath. */ +struct dpif_netdev { + struct dpif dpif; + struct dp_netdev *dp; + uint64_t last_port_seq; +}; + +static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no, + struct dp_netdev_port **portp) + OVS_REQ_RDLOCK(dp->port_rwlock); +static int get_port_by_name(struct dp_netdev *dp, const char *devname, + struct dp_netdev_port **portp) + OVS_REQ_RDLOCK(dp->port_rwlock); +static void dp_netdev_free(struct dp_netdev *dp) + OVS_NO_THREAD_SAFETY_ANALYSIS; +static int do_add_port(struct dp_netdev *dp, const char *devname, + const char *type, odp_port_t port_no, + struct netdev **datapath_netdev) + OVS_REQ_WRLOCK(dp->port_rwlock); +static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *) + OVS_REQ_WRLOCK(dp->port_rwlock); +static int dpif_netdev2_open(const struct dpif_class *, const char *name, + bool create, struct dpif **); +static sflow_upcall_callback *sflow_upcall_cb; +static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *, + bool should_steal, + const struct flow *flow, + struct dp_netdev_flow *dp_flow, + const struct nlattr *actions, + size_t actions_len); +static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *, + struct dp_packet_batch *); + +static void dp_netdev_disable_upcall(struct dp_netdev *); +static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev *dp, unsigned core_id, + int numa_id); +static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_set_nonpmd(struct dp_netdev *dp) + OVS_REQ_WRLOCK(dp->port_rwlock); + +static void *pmd_thread_main(void *); +static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp, + unsigned core_id); +static struct 
dp_netdev_pmd_thread * +dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos); +static void dp_netdev_del_pmd(struct dp_netdev *dp, + struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd); +static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_port *port) + OVS_REQUIRES(pmd->port_mutex); +static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, + struct tx_port *tx) + OVS_REQUIRES(pmd->port_mutex); +static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_rxq *rxq) + OVS_REQUIRES(pmd->port_mutex); +static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd, + struct rxq_poll *poll) + OVS_REQUIRES(pmd->port_mutex); +static int +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, + bool force); +static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, + struct tx_bond *bond, bool update) + OVS_EXCLUDED(pmd->bond_mutex); +static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd, + uint32_t bond_id) + OVS_EXCLUDED(pmd->bond_mutex); + +static void dp_netdev_offload_flush(struct dp_netdev *dp, + struct dp_netdev_port *port) + OVS_EXCLUDED(dp->port_rwlock); + +static void reconfigure_datapath(struct dp_netdev *dp) + OVS_REQ_RDLOCK(dp->port_rwlock); +static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd); +static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd); +static void +dp_netdev_port_flow_flush(struct dp_netdev *dp, struct dp_netdev_port *port); + +static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd) + OVS_REQUIRES(pmd->port_mutex); +static inline void +dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, + struct polled_queue *poll_list, int 
poll_cnt); +static void +dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, + enum rxq_cycles_counter_type type, + unsigned long long cycles); +static uint64_t +dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx, + enum rxq_cycles_counter_type type); +static void +dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, + unsigned long long cycles); +static uint64_t +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx); +static uint64_t +get_interval_values(atomic_ullong *source, atomic_count *cur_idx, + int num_to_read); +static void +dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, + bool purge); +static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd, + struct tx_port *tx); +inline struct dpcls * +dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd, + odp_port_t in_port); + +static void dp_netdev_request_reconfigure(struct dp_netdev *dp); +static inline bool +pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd); +static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_flow *flow); + +static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_flow *flow) + OVS_REQUIRES(pmd->flow_mutex); +static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_flow *flow) + OVS_REQUIRES(pmd->flow_mutex); + +static bool dp_netdev_flow_is_simple_match(const struct match *); + +/* Updates the time in PMD threads context and should be called in three cases: + * + * 1. PMD structure initialization: + * - dp_netdev_configure_pmd() + * + * 2. Before processing of the new packet batch: + * - dpif_netdev_execute() + * - dp_netdev_process_rxq_port() + * + * 3. At least once per polling iteration in main polling threads if no + * packets received on current iteration: + * - dpif_netdev_run() + * - pmd_thread_main() + * + * 'pmd->ctx.now' should be used without update in all other cases if possible. 
+ */ +static inline void +pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd) +{ + pmd->ctx.now = time_usec(); +} + +/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */ +bool +dpif_is_netdev2(const struct dpif *dpif) +{ + return dpif->dpif_class->open == dpif_netdev2_open; +} + +static struct dpif_netdev * +dpif_netdev_cast(const struct dpif *dpif) +{ + ovs_assert(dpif_is_netdev2(dpif)); + return CONTAINER_OF(dpif, struct dpif_netdev, dpif); +} + +struct dp_netdev * +get_dp_netdev2(const struct dpif *dpif) +{ + return dpif_netdev_cast(dpif)->dp; +} + +enum pmd_info_type { + PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */ + PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */ + PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */ + PMD_INFO_PERF_SHOW, /* Show pmd performance details. */ + PMD_INFO_SLEEP_SHOW, /* Show max sleep configuration details. */ +}; + +static void +format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd) +{ + ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID) + ? 
"main thread" : "pmd thread"); + if (pmd->numa_id != OVS_NUMA_UNSPEC) { + ds_put_format(reply, " numa_id %d", pmd->numa_id); + } + if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) { + ds_put_format(reply, " core_id %u", pmd->core_id); + } + ds_put_cstr(reply, ":\n"); +} + +static void +pmd_info_show_stats(struct ds *reply, + struct dp_netdev_pmd_thread *pmd) +{ + uint64_t stats[PMD_N_STATS]; + uint64_t total_cycles, total_packets; + double passes_per_pkt = 0; + double lookups_per_hit = 0; + double packets_per_batch = 0; + + pmd_perf_read_counters(&pmd->perf_stats, stats); + total_cycles = stats[PMD_CYCLES_ITER_IDLE] + + stats[PMD_CYCLES_ITER_BUSY]; + total_packets = stats[PMD_STAT_RECV]; + + format_pmd_thread(reply, pmd); + + if (total_packets > 0) { + passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC]) + / (double) total_packets; + } + if (stats[PMD_STAT_MASKED_HIT] > 0) { + lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP] + / (double) stats[PMD_STAT_MASKED_HIT]; + } + if (stats[PMD_STAT_SENT_BATCHES] > 0) { + packets_per_batch = stats[PMD_STAT_SENT_PKTS] + / (double) stats[PMD_STAT_SENT_BATCHES]; + } + + ds_put_format(reply, + " packets received: %"PRIu64"\n" + " packet recirculations: %"PRIu64"\n" + " avg. datapath passes per packet: %.02f\n" + " phwol hits: %"PRIu64"\n" + " mfex opt hits: %"PRIu64"\n" + " simple match hits: %"PRIu64"\n" + " emc hits: %"PRIu64"\n" + " smc hits: %"PRIu64"\n" + " megaflow hits: %"PRIu64"\n" + " avg. subtable lookups per megaflow hit: %.02f\n" + " miss with success upcall: %"PRIu64"\n" + " miss with failed upcall: %"PRIu64"\n" + " avg. 
packets per output batch: %.02f\n", + total_packets, stats[PMD_STAT_RECIRC], + passes_per_pkt, stats[PMD_STAT_PHWOL_HIT], + stats[PMD_STAT_MFEX_OPT_HIT], + stats[PMD_STAT_SIMPLE_HIT], + stats[PMD_STAT_EXACT_HIT], + stats[PMD_STAT_SMC_HIT], + stats[PMD_STAT_MASKED_HIT], + lookups_per_hit, stats[PMD_STAT_MISS], stats[PMD_STAT_LOST], + packets_per_batch); + + if (total_cycles == 0) { + return; + } + + ds_put_format(reply, + " idle cycles: %"PRIu64" (%.02f%%)\n" + " processing cycles: %"PRIu64" (%.02f%%)\n", + stats[PMD_CYCLES_ITER_IDLE], + stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100, + stats[PMD_CYCLES_ITER_BUSY], + stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100); + + if (total_packets == 0) { + return; + } + + ds_put_format(reply, + " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n", + total_cycles / (double) total_packets, + total_cycles, total_packets); + + ds_put_format(reply, + " avg processing cycles per packet: " + "%.02f (%"PRIu64"/%"PRIu64")\n", + stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets, + stats[PMD_CYCLES_ITER_BUSY], total_packets); +} + +static void +pmd_info_show_perf(struct ds *reply, + struct dp_netdev_pmd_thread *pmd, + struct pmd_perf_params *par) +{ + if (pmd->core_id != NON_PMD_CORE_ID) { + char *time_str = + xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true); + long long now = time_msec(); + double duration = (now - pmd->perf_stats.start_ms) / 1000.0; + + ds_put_cstr(reply, "\n"); + ds_put_format(reply, "Time: %s\n", time_str); + ds_put_format(reply, "Measurement duration: %.3f s\n", duration); + ds_put_cstr(reply, "\n"); + format_pmd_thread(reply, pmd); + ds_put_cstr(reply, "\n"); + pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration); + if (pmd_perf_metrics_enabled(pmd)) { + /* Prevent parallel clearing of perf metrics. 
*/ + ovs_mutex_lock(&pmd->perf_stats.clear_mutex); + if (par->histograms) { + ds_put_cstr(reply, "\n"); + pmd_perf_format_histograms(reply, &pmd->perf_stats); + } + if (par->iter_hist_len > 0) { + ds_put_cstr(reply, "\n"); + pmd_perf_format_iteration_history(reply, &pmd->perf_stats, + par->iter_hist_len); + } + if (par->ms_hist_len > 0) { + ds_put_cstr(reply, "\n"); + pmd_perf_format_ms_history(reply, &pmd->perf_stats, + par->ms_hist_len); + } + ovs_mutex_unlock(&pmd->perf_stats.clear_mutex); + } + free(time_str); + } +} + +static int +compare_poll_list(const void *a_, const void *b_) +{ + const struct rxq_poll *a = a_; + const struct rxq_poll *b = b_; + + const char *namea = netdev_rxq_get_name(a->rxq->rx); + const char *nameb = netdev_rxq_get_name(b->rxq->rx); + + int cmp = strcmp(namea, nameb); + if (!cmp) { + return netdev_rxq_get_queue_id(a->rxq->rx) + - netdev_rxq_get_queue_id(b->rxq->rx); + } else { + return cmp; + } +} + +static void +sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list, + size_t *n) + OVS_REQUIRES(pmd->port_mutex) +{ + struct rxq_poll *ret, *poll; + size_t i; + + *n = hmap_count(&pmd->poll_list); + if (!*n) { + ret = NULL; + } else { + ret = xcalloc(*n, sizeof *ret); + i = 0; + HMAP_FOR_EACH (poll, node, &pmd->poll_list) { + ret[i] = *poll; + i++; + } + ovs_assert(i == *n); + qsort(ret, *n, sizeof *ret, compare_poll_list); + } + + *list = ret; +} + +static void +pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, + int secs) +{ + if (pmd->core_id != NON_PMD_CORE_ID) { + struct rxq_poll *list; + size_t n_rxq; + uint64_t total_pmd_cycles = 0; + uint64_t busy_pmd_cycles = 0; + uint64_t total_rxq_proc_cycles = 0; + unsigned int intervals; + + ds_put_format(reply, + "pmd thread numa_id %d core_id %u:\n isolated : %s\n", + pmd->numa_id, pmd->core_id, (pmd->isolated) + ? 
"true" : "false"); + + ovs_mutex_lock(&pmd->port_mutex); + sorted_poll_list(pmd, &list, &n_rxq); + + /* Get the total pmd cycles for an interval. */ + atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles); + /* Calculate how many intervals are to be used. */ + intervals = DIV_ROUND_UP(secs, + PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); + /* Estimate the cycles to cover all intervals. */ + total_pmd_cycles *= intervals; + busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl, + &pmd->intrvl_idx, + intervals); + if (busy_pmd_cycles > total_pmd_cycles) { + busy_pmd_cycles = total_pmd_cycles; + } + + for (int i = 0; i < n_rxq; i++) { + struct dp_netdev_rxq *rxq = list[i].rxq; + const char *name = netdev_rxq_get_name(rxq->rx); + uint64_t rxq_proc_cycles = 0; + + rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl, + &rxq->intrvl_idx, + intervals); + total_rxq_proc_cycles += rxq_proc_cycles; + ds_put_format(reply, " port: %-16s queue-id: %2d", name, + netdev_rxq_get_queue_id(list[i].rxq->rx)); + ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx) + ? 
"(enabled) " : "(disabled)"); + ds_put_format(reply, " pmd usage: "); + if (total_pmd_cycles) { + ds_put_format(reply, "%2"PRIu64"", + rxq_proc_cycles * 100 / total_pmd_cycles); + ds_put_cstr(reply, " %"); + } else { + ds_put_format(reply, "%s", "NOT AVAIL"); + } + ds_put_cstr(reply, "\n"); + } + + if (n_rxq > 0) { + ds_put_cstr(reply, " overhead: "); + if (total_pmd_cycles) { + uint64_t overhead_cycles = 0; + + if (total_rxq_proc_cycles < busy_pmd_cycles) { + overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles; + } + ds_put_format(reply, "%2"PRIu64" %%", + overhead_cycles * 100 / total_pmd_cycles); + } else { + ds_put_cstr(reply, "NOT AVAIL"); + } + ds_put_cstr(reply, "\n"); + } + + ovs_mutex_unlock(&pmd->port_mutex); + free(list); + } +} + +static int +compare_poll_thread_list(const void *a_, const void *b_) +{ + const struct dp_netdev_pmd_thread *a, *b; + + a = *(struct dp_netdev_pmd_thread **)a_; + b = *(struct dp_netdev_pmd_thread **)b_; + + if (a->core_id < b->core_id) { + return -1; + } + if (a->core_id > b->core_id) { + return 1; + } + return 0; +} + +/* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use + * this list, as long as we do not go to quiescent state. 
*/ +static void +sorted_poll_thread_list(struct dp_netdev *dp, + struct dp_netdev_pmd_thread ***list, + size_t *n) +{ + struct dp_netdev_pmd_thread *pmd; + struct dp_netdev_pmd_thread **pmd_list; + size_t k = 0, n_pmds; + + n_pmds = cmap_count(&dp->poll_threads); + pmd_list = xcalloc(n_pmds, sizeof *pmd_list); + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (k >= n_pmds) { + break; + } + pmd_list[k++] = pmd; + } + + qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list); + + *list = pmd_list; + *n = k; +} + +static void +dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, + void *aux OVS_UNUSED) +{ + struct ds reply = DS_EMPTY_INITIALIZER; + + dpcls_impl_print_stats(&reply); + unixctl_command_reply(conn, ds_cstr(&reply)); + ds_destroy(&reply); +} + +static void +dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[], void *aux OVS_UNUSED) +{ + /* This function requires 2 parameters (argv[1] and argv[2]) to execute. + * argv[1] is subtable name + * argv[2] is priority + */ + const char *func_name = argv[1]; + + errno = 0; + char *err_char; + uint32_t new_prio = strtoul(argv[2], &err_char, 10); + uint32_t lookup_dpcls_changed = 0; + uint32_t lookup_subtable_changed = 0; + struct shash_node *node; + if (errno != 0 || new_prio > UINT8_MAX) { + unixctl_command_reply_error(conn, + "error converting priority, use integer in range 0-255\n"); + return; + } + + int32_t err = dpcls_subtable_set_prio(func_name, new_prio); + if (err) { + unixctl_command_reply_error(conn, + "error, subtable lookup function not found\n"); + return; + } + + ovs_mutex_lock(&dp_netdev_mutex); + SHASH_FOR_EACH (node, &dp_netdevs) { + struct dp_netdev *dp = node->data; + + /* Get PMD threads list, required to get DPCLS instances. */ + size_t n; + struct dp_netdev_pmd_thread **pmd_list; + sorted_poll_thread_list(dp, &pmd_list, &n); + + /* take port rwlock as HMAP iters over them. 
*/ + dp_netdev_port_rdlock(dp); + + for (size_t i = 0; i < n; i++) { + struct dp_netdev_pmd_thread *pmd = pmd_list[i]; + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + struct dp_netdev_port *port = NULL; + HMAP_FOR_EACH (port, node, &dp->ports) { + odp_port_t in_port = port->port_no; + struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); + if (!cls) { + continue; + } + ovs_mutex_lock(&pmd->flow_mutex); + uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls); + ovs_mutex_unlock(&pmd->flow_mutex); + if (subtbl_changes) { + lookup_dpcls_changed++; + lookup_subtable_changed += subtbl_changes; + } + } + } + + /* release port mutex before netdev mutex. */ + ovs_rwlock_unlock(&dp->port_rwlock); + free(pmd_list); + } + ovs_mutex_unlock(&dp_netdev_mutex); + + struct ds reply = DS_EMPTY_INITIALIZER; + ds_put_format(&reply, + "Lookup priority change affected %d dpcls ports and %d subtables.\n", + lookup_dpcls_changed, lookup_subtable_changed); + const char *reply_str = ds_cstr(&reply); + unixctl_command_reply(conn, reply_str); + VLOG_INFO("%s", reply_str); + ds_destroy(&reply); +} + +static void +dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) +{ + struct ds reply = DS_EMPTY_INITIALIZER; + struct shash_node *node; + + ovs_mutex_lock(&dp_netdev_mutex); + SHASH_FOR_EACH (node, &dp_netdevs) { + struct dp_netdev_pmd_thread **pmd_list; + struct dp_netdev *dp = node->data; + size_t n; + + /* Get PMD threads list, required to get the DPIF impl used by each PMD + * thread. 
*/ + sorted_poll_thread_list(dp, &pmd_list, &n); + dp_netdev_impl_get(&reply, pmd_list, n); + free(pmd_list); + } + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply(conn, ds_cstr(&reply)); + ds_destroy(&reply); +} + +static void +dpif_netdev_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[], void *aux OVS_UNUSED) +{ + /* This function requires just one parameter, the DPIF name. */ + const char *dpif_name = argv[1]; + struct shash_node *node; + + static const char *error_description[2] = { + "Unknown DPIF implementation", + "CPU doesn't support the required instruction for", + }; + + ovs_mutex_lock(&dp_netdev_mutex); + int32_t err = dp_netdev_impl_set_default_by_name(dpif_name); + + if (err) { + struct ds reply = DS_EMPTY_INITIALIZER; + ds_put_format(&reply, "DPIF implementation not available: %s %s.\n", + error_description[ (err == -ENOTSUP) ], dpif_name); + const char *reply_str = ds_cstr(&reply); + unixctl_command_reply_error(conn, reply_str); + VLOG_ERR("%s", reply_str); + ds_destroy(&reply); + ovs_mutex_unlock(&dp_netdev_mutex); + return; + } + + SHASH_FOR_EACH (node, &dp_netdevs) { + struct dp_netdev *dp = node->data; + + /* Get PMD threads list, required to get DPCLS instances. */ + size_t n; + struct dp_netdev_pmd_thread **pmd_list; + sorted_poll_thread_list(dp, &pmd_list, &n); + + for (size_t i = 0; i < n; i++) { + struct dp_netdev_pmd_thread *pmd = pmd_list[i]; + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + /* Initialize DPIF function pointer to the newly configured + * default. */ + atomic_store_relaxed(&pmd->netdev_input_func, + dp_netdev_impl_get_default()); + }; + + free(pmd_list); + } + ovs_mutex_unlock(&dp_netdev_mutex); + + /* Reply with success to command. 
*/ + struct ds reply = DS_EMPTY_INITIALIZER; + ds_put_format(&reply, "DPIF implementation set to %s.\n", dpif_name); + const char *reply_str = ds_cstr(&reply); + unixctl_command_reply(conn, reply_str); + VLOG_INFO("%s", reply_str); + ds_destroy(&reply); +} + +static void +dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, + void *aux OVS_UNUSED) +{ + struct ds reply = DS_EMPTY_INITIALIZER; + struct shash_node *node; + + ovs_mutex_lock(&dp_netdev_mutex); + SHASH_FOR_EACH (node, &dp_netdevs) { + struct dp_netdev_pmd_thread **pmd_list; + struct dp_netdev *dp = node->data; + size_t n; + + /* Get PMD threads list, required to get the DPIF impl used by each PMD + * thread. */ + sorted_poll_thread_list(dp, &pmd_list, &n); + dp_mfex_impl_get(&reply, pmd_list, n); + free(pmd_list); + } + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply(conn, ds_cstr(&reply)); + ds_destroy(&reply); +} + +static void +dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc, + const char *argv[], void *aux OVS_UNUSED) +{ + /* This command takes some optional and mandatory arguments. The function + * here first parses all of the options, saving results in local variables. + * Then the parsed values are acted on. + */ + unsigned int pmd_thread_to_change = NON_PMD_CORE_ID; + unsigned int study_count = MFEX_MAX_PKT_COUNT; + struct ds reply = DS_EMPTY_INITIALIZER; + bool pmd_thread_update_done = false; + bool mfex_name_is_study = false; + const char *mfex_name = NULL; + const char *reply_str = NULL; + struct shash_node *node; + int err; + + while (argc > 1) { + /* Optional argument "-pmd" limits the commands actions to just this + * PMD thread. + */ + if ((!strcmp(argv[1], "-pmd") && !mfex_name)) { + if (argc < 3) { + ds_put_format(&reply, + "Error: -pmd option requires a thread id" + " argument.\n"); + goto error; + } + + /* Ensure argument can be parsed to an integer. 
*/ + if (!str_to_uint(argv[2], 10, &pmd_thread_to_change) || + (pmd_thread_to_change == NON_PMD_CORE_ID)) { + ds_put_format(&reply, + "Error: miniflow extract parser not changed," + " PMD thread passed is not valid: '%s'." + " Pass a valid pmd thread ID.\n", + argv[2]); + goto error; + } + + argc -= 2; + argv += 2; + + } else if (!mfex_name) { + /* Name of MFEX impl requested by user. */ + mfex_name = argv[1]; + mfex_name_is_study = strcmp("study", mfex_name) == 0; + argc -= 1; + argv += 1; + + /* If name is study and more args exist, parse study_count value. */ + } else if (mfex_name && mfex_name_is_study) { + if (!str_to_uint(argv[1], 10, &study_count) || + (study_count == 0)) { + ds_put_format(&reply, + "Error: invalid study_pkt_cnt value: %s.\n", + argv[1]); + goto error; + } + + argc -= 1; + argv += 1; + } else { + ds_put_format(&reply, "Error: unknown argument %s.\n", argv[1]); + goto error; + } + } + + /* Ensure user passed an MFEX name. */ + if (!mfex_name) { + ds_put_format(&reply, "Error: no miniflow extract name provided." + " Output of miniflow-parser-get shows implementation" + " list.\n"); + goto error; + } + + /* If the MFEX name is "study", set the study packet count. */ + if (mfex_name_is_study) { + err = mfex_set_study_pkt_cnt(study_count, mfex_name); + if (err) { + ds_put_format(&reply, "Error: failed to set study count %d for" + " miniflow extract implementation %s.\n", + study_count, mfex_name); + goto error; + } + } + + /* Set the default MFEX impl only if the command was applied to all PMD + * threads. If a PMD thread was selected, do NOT update the default. 
+ */ + if (pmd_thread_to_change == NON_PMD_CORE_ID) { + err = dp_mfex_impl_set_default_by_name(mfex_name); + if (err == -ENODEV) { + ds_put_format(&reply, + "Error: miniflow extract not available due to CPU" + " ISA requirements: %s", + mfex_name); + goto error; + } else if (err) { + ds_put_format(&reply, + "Error: unknown miniflow extract implementation %s.", + mfex_name); + goto error; + } + } + + /* Get the desired MFEX function pointer and error check its usage. */ + miniflow_extract_func mfex_func = NULL; + err = dp_mfex_impl_get_by_name(mfex_name, &mfex_func); + if (err) { + if (err == -ENODEV) { + ds_put_format(&reply, + "Error: miniflow extract not available due to CPU" + " ISA requirements: %s", mfex_name); + } else { + ds_put_format(&reply, + "Error: unknown miniflow extract implementation %s.", + mfex_name); + } + goto error; + } + + /* Apply the MFEX pointer to each pmd thread in each netdev, filtering + * by the users "-pmd" argument if required. + */ + ovs_mutex_lock(&dp_netdev_mutex); + + SHASH_FOR_EACH (node, &dp_netdevs) { + struct dp_netdev_pmd_thread **pmd_list; + struct dp_netdev *dp = node->data; + size_t n; + + sorted_poll_thread_list(dp, &pmd_list, &n); + + for (size_t i = 0; i < n; i++) { + struct dp_netdev_pmd_thread *pmd = pmd_list[i]; + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + /* If -pmd specified, skip all other pmd threads. */ + if ((pmd_thread_to_change != NON_PMD_CORE_ID) && + (pmd->core_id != pmd_thread_to_change)) { + continue; + } + + pmd_thread_update_done = true; + atomic_store_relaxed(&pmd->miniflow_extract_opt, mfex_func); + }; + + free(pmd_list); + } + + ovs_mutex_unlock(&dp_netdev_mutex); + + /* If PMD thread was specified, but it wasn't found, return error. 
*/ + if (pmd_thread_to_change != NON_PMD_CORE_ID && !pmd_thread_update_done) { + ds_put_format(&reply, + "Error: miniflow extract parser not changed, " + "PMD thread %d not in use, pass a valid pmd" + " thread ID.\n", pmd_thread_to_change); + goto error; + } + + /* Reply with success to command. */ + ds_put_format(&reply, "Miniflow extract implementation set to %s", + mfex_name); + if (pmd_thread_to_change != NON_PMD_CORE_ID) { + ds_put_format(&reply, ", on pmd thread %d", pmd_thread_to_change); + } + if (mfex_name_is_study) { + ds_put_format(&reply, ", studying %d packets", study_count); + } + ds_put_format(&reply, ".\n"); + + reply_str = ds_cstr(&reply); + VLOG_INFO("%s", reply_str); + unixctl_command_reply(conn, reply_str); + ds_destroy(&reply); + return; + +error: + reply_str = ds_cstr(&reply); + VLOG_ERR("%s", reply_str); + unixctl_command_reply_error(conn, reply_str); + ds_destroy(&reply); +} + +static void +dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc, + const char *argv[], void *aux OVS_UNUSED) +{ + struct ds reply = DS_EMPTY_INITIALIZER; + struct dp_netdev *dp = NULL; + + ovs_mutex_lock(&dp_netdev_mutex); + + if (argc == 2) { + dp = shash_find_data(&dp_netdevs, argv[1]); + } else if (shash_count(&dp_netdevs) == 1) { + /* There's only one datapath */ + dp = shash_first(&dp_netdevs)->data; + } + + if (!dp) { + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply_error(conn, + "please specify an existing datapath"); + return; + } + + dp_netdev_request_reconfigure(dp); + ovs_mutex_unlock(&dp_netdev_mutex); + ds_put_cstr(&reply, "pmd rxq rebalance requested.\n"); + unixctl_command_reply(conn, ds_cstr(&reply)); + ds_destroy(&reply); +} + +static void +pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id, + uint64_t pmd_max_sleep) +{ + if (core_id == NON_PMD_CORE_ID) { + return; + } + ds_put_format(reply, + "pmd thread numa_id %d core_id %d:\n" + " max sleep: %4"PRIu64" us\n", + numa_id, core_id, pmd_max_sleep); +} + +static 
void +dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], + void *aux) +{ + struct ds reply = DS_EMPTY_INITIALIZER; + struct dp_netdev_pmd_thread **pmd_list; + struct dp_netdev *dp = NULL; + enum pmd_info_type type = *(enum pmd_info_type *) aux; + unsigned int core_id; + bool filter_on_pmd = false; + size_t n; + unsigned int secs = 0; + unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX) + / INTERVAL_USEC_TO_SEC; + bool show_header = true; + uint64_t max_sleep; + + ovs_mutex_lock(&dp_netdev_mutex); + + while (argc > 1) { + if (!strcmp(argv[1], "-pmd") && argc > 2) { + if (str_to_uint(argv[2], 10, &core_id)) { + filter_on_pmd = true; + } + argc -= 2; + argv += 2; + } else if (type == PMD_INFO_SHOW_RXQ && + !strcmp(argv[1], "-secs") && + argc > 2) { + if (!str_to_uint(argv[2], 10, &secs)) { + secs = max_secs; + } + argc -= 2; + argv += 2; + } else { + dp = shash_find_data(&dp_netdevs, argv[1]); + argc -= 1; + argv += 1; + } + } + + if (!dp) { + if (shash_count(&dp_netdevs) == 1) { + /* There's only one datapath */ + dp = shash_first(&dp_netdevs)->data; + } else { + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply_error(conn, + "please specify an existing datapath"); + return; + } + } + + sorted_poll_thread_list(dp, &pmd_list, &n); + for (size_t i = 0; i < n; i++) { + struct dp_netdev_pmd_thread *pmd = pmd_list[i]; + if (!pmd) { + break; + } + if (filter_on_pmd && pmd->core_id != core_id) { + continue; + } + if (type == PMD_INFO_SHOW_RXQ) { + if (show_header) { + if (!secs || secs > max_secs) { + secs = max_secs; + } else { + secs = ROUND_UP(secs, + PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); + } + ds_put_format(&reply, "Displaying last %u seconds " + "pmd usage %%\n", secs); + show_header = false; + } + pmd_info_show_rxq(&reply, pmd, secs); + } else if (type == PMD_INFO_CLEAR_STATS) { + pmd_perf_stats_clear(&pmd->perf_stats); + } else if (type == PMD_INFO_SHOW_STATS) { + pmd_info_show_stats(&reply, pmd); + } else if 
(type == PMD_INFO_PERF_SHOW) { + pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); + } else if (type == PMD_INFO_SLEEP_SHOW) { + if (show_header) { + ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n", + dp->pmd_max_sleep_default); + show_header = false; + } + atomic_read_relaxed(&pmd->max_sleep, &max_sleep); + pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id, + max_sleep); + } + } + free(pmd_list); + + ovs_mutex_unlock(&dp_netdev_mutex); + + unixctl_command_reply(conn, ds_cstr(&reply)); + ds_destroy(&reply); +} + +static void +pmd_perf_show_cmd(struct unixctl_conn *conn, int argc, + const char *argv[], + void *aux OVS_UNUSED) +{ + struct pmd_perf_params par; + long int it_hist = 0, ms_hist = 0; + par.histograms = true; + + while (argc > 1) { + if (!strcmp(argv[1], "-nh")) { + par.histograms = false; + argc -= 1; + argv += 1; + } else if (!strcmp(argv[1], "-it") && argc > 2) { + it_hist = strtol(argv[2], NULL, 10); + if (it_hist < 0) { + it_hist = 0; + } else if (it_hist > HISTORY_LEN) { + it_hist = HISTORY_LEN; + } + argc -= 2; + argv += 2; + } else if (!strcmp(argv[1], "-ms") && argc > 2) { + ms_hist = strtol(argv[2], NULL, 10); + if (ms_hist < 0) { + ms_hist = 0; + } else if (ms_hist > HISTORY_LEN) { + ms_hist = HISTORY_LEN; + } + argc -= 2; + argv += 2; + } else { + break; + } + } + par.iter_hist_len = it_hist; + par.ms_hist_len = ms_hist; + par.command_type = PMD_INFO_PERF_SHOW; + dpif_netdev_pmd_info(conn, argc, argv, &par); +} + +static void +dpif_netdev_bond_show(struct unixctl_conn *conn, int argc, + const char *argv[], void *aux OVS_UNUSED) +{ + struct ds reply = DS_EMPTY_INITIALIZER; + struct dp_netdev *dp = NULL; + + ovs_mutex_lock(&dp_netdev_mutex); + if (argc == 2) { + dp = shash_find_data(&dp_netdevs, argv[1]); + } else if (shash_count(&dp_netdevs) == 1) { + /* There's only one datapath. 
*/ + dp = shash_first(&dp_netdevs)->data; + } + if (!dp) { + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply_error(conn, + "please specify an existing datapath"); + return; + } + + if (cmap_count(&dp->tx_bonds) > 0) { + struct tx_bond *dp_bond_entry; + + ds_put_cstr(&reply, "Bonds:\n"); + CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) { + ds_put_format(&reply, " bond-id %"PRIu32":\n", + dp_bond_entry->bond_id); + for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) { + uint32_t member_id = odp_to_u32( + dp_bond_entry->member_buckets[bucket].member_id); + ds_put_format(&reply, + " bucket %d - member %"PRIu32"\n", + bucket, member_id); + } + } + } + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply(conn, ds_cstr(&reply)); + ds_destroy(&reply); +} + + + +static int +dpif_netdev_init(void) +{ + static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS, + clear_aux = PMD_INFO_CLEAR_STATS, + poll_aux = PMD_INFO_SHOW_RXQ, + sleep_aux = PMD_INFO_SLEEP_SHOW; + + unixctl_command_register("dpif-netdev2/pmd-stats-show", "[-pmd core] [dp]", + 0, 3, dpif_netdev_pmd_info, + (void *)&show_aux); + unixctl_command_register("dpif-netdev2/pmd-stats-clear", "[-pmd core] [dp]", + 0, 3, dpif_netdev_pmd_info, + (void *)&clear_aux); + unixctl_command_register("dpif-netdev2/pmd-rxq-show", "[-pmd core] " + "[-secs secs] [dp]", + 0, 5, dpif_netdev_pmd_info, + (void *)&poll_aux); + unixctl_command_register("dpif-netdev2/pmd-sleep-show", "[dp]", + 0, 1, dpif_netdev_pmd_info, + (void *)&sleep_aux); + unixctl_command_register("dpif-netdev2/pmd-perf-show", + "[-nh] [-it iter-history-len]" + " [-ms ms-history-len]" + " [-pmd core] [dp]", + 0, 8, pmd_perf_show_cmd, + NULL); + unixctl_command_register("dpif-netdev2/pmd-rxq-rebalance", "[dp]", + 0, 1, dpif_netdev_pmd_rebalance, + NULL); + unixctl_command_register("dpif-netdev2/pmd-perf-log-set", + "on|off [-b before] [-a after] [-e|-ne] " + "[-us usec] [-q qlen]", + 0, 10, pmd_perf_log_set_cmd, + NULL); + 
unixctl_command_register("dpif-netdev2/bond-show", "[dp]", + 0, 1, dpif_netdev_bond_show, + NULL); + unixctl_command_register("dpif-netdev2/subtable-lookup-prio-set", + "[lookup_func] [prio]", + 2, 2, dpif_netdev_subtable_lookup_set, + NULL); + unixctl_command_register("dpif-netdev2/subtable-lookup-info-get", "", + 0, 0, dpif_netdev_subtable_lookup_get, + NULL); + unixctl_command_register("dpif-netdev2/subtable-lookup-prio-get", NULL, + 0, 0, dpif_netdev_subtable_lookup_get, + NULL); + unixctl_command_register("dpif-netdev2/dpif-impl-set", + "dpif_implementation_name", + 1, 1, dpif_netdev_impl_set, + NULL); + unixctl_command_register("dpif-netdev2/dpif-impl-get", "", + 0, 0, dpif_netdev_impl_get, + NULL); + unixctl_command_register("dpif-netdev2/miniflow-parser-set", + "[-pmd core] miniflow_implementation_name" + " [study_pkt_cnt]", + 1, 5, dpif_miniflow_extract_impl_set, + NULL); + unixctl_command_register("dpif-netdev2/miniflow-parser-get", "", + 0, 0, dpif_miniflow_extract_impl_get, + NULL); + unixctl_command_register("dpif-netdev2/dump-packets", "[on/off]", + 0, 1, dp_netdev_dump_packets_toggle, + NULL); + + dpif_netdev_metrics_register(); + dp_netdev_offload_thread_init(&dp_offload_threads[NETDEV_OFFLOAD_THREAD_MAIN]); + + return 0; +} + +static int +dpif_netdev_enumerate(struct sset *all_dps, + const struct dpif_class *dpif_class) +{ + struct shash_node *node; + + ovs_mutex_lock(&dp_netdev_mutex); + SHASH_FOR_EACH(node, &dp_netdevs) { + struct dp_netdev *dp = node->data; + if (dpif_class != dp->class) { + /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs. + * If the class doesn't match, skip this dpif. 
*/ + continue; + } + sset_add(all_dps, node->name); + } + ovs_mutex_unlock(&dp_netdev_mutex); + + return 0; +} + +static bool +dpif_netdev_class_is_dummy(const struct dpif_class *class) +{ + return class != &dpif_netdev2_class; +} + +static const char * +dpif_netdev_port_open_type(const struct dpif_class *class, const char *type) +{ + return strcmp(type, "internal") ? type + : dpif_netdev_class_is_dummy(class) ? "dummy-internal" + : "tap"; +} + +static struct dpif * +create_dpif_netdev(struct dp_netdev *dp) +{ + uint16_t netflow_id = hash_string(dp->name, 0); + struct dpif_netdev *dpif; + + ovs_refcount_ref(&dp->ref_cnt); + + dpif = xmalloc(sizeof *dpif); + dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id); + dpif->dp = dp; + dpif->last_port_seq = seq_read(dp->port_seq); + + return &dpif->dpif; +} + +/* Choose an unused, non-zero port number and return it on success. + * Return ODPP_NONE on failure. */ +static odp_port_t +choose_port(struct dp_netdev *dp, const char *name) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + uint32_t port_no; + + if (dp->class != &dpif_netdev2_class) { + const char *p; + int start_no = 0; + + /* If the port name begins with "br", start the number search at + * 100 to make writing tests easier. */ + if (!strncmp(name, "br", 2)) { + start_no = 100; + } + + /* If the port name contains a number, try to assign that port number. + * This can make writing unit tests easier because port numbers are + * predictable. 
*/ + for (p = name; *p != '\0'; p++) { + if (isdigit((unsigned char) *p)) { + port_no = start_no + strtol(p, NULL, 10); + if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE) + && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) { + return u32_to_odp(port_no); + } + break; + } + } + } + + for (port_no = 1; port_no <= UINT16_MAX; port_no++) { + if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) { + return u32_to_odp(port_no); + } + } + + return ODPP_NONE; +} + +static uint32_t +dp_meter_hash(uint32_t meter_id) +{ + /* In the ofproto-dpif layer, we use the id-pool to alloc meter id + * orderly (e.g. 1, 2, ... N.), which provides a better hash + * distribution. Use them directly instead of hash_xxx function for + * achieving high-performance. */ + return meter_id; +} + +static void +dp_netdev_meter_destroy(struct dp_netdev *dp) +{ + struct dp_meter *m; + + ovs_mutex_lock(&dp->meters_lock); + CMAP_FOR_EACH (m, node, &dp->meters) { + cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id)); + ovsrcu_postpone(free, m); + } + + cmap_destroy(&dp->meters); + ovs_mutex_unlock(&dp->meters_lock); + ovs_mutex_destroy(&dp->meters_lock); +} + +static struct dp_meter * +dp_meter_lookup(struct cmap *meters, uint32_t meter_id) +{ + uint32_t hash = dp_meter_hash(meter_id); + struct dp_meter *m; + + CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) { + if (m->id == meter_id) { + return m; + } + } + + return NULL; +} + +static void +dp_meter_detach_free(struct cmap *meters, uint32_t meter_id) +{ + struct dp_meter *m = dp_meter_lookup(meters, meter_id); + + if (m) { + cmap_remove(meters, &m->node, dp_meter_hash(meter_id)); + ovsrcu_postpone(free, m); + } +} + +static void +dp_meter_attach(struct cmap *meters, struct dp_meter *meter) +{ + cmap_insert(meters, &meter->node, dp_meter_hash(meter->id)); +} + +#define SYSTEM_PORT_WATCHDOG_INTERVAL 5 + +static void * +system_port_watchdog_main(void *dp_) +{ + struct dp_netdev_port *port; + struct dp_netdev *dp = dp_; + + VLOG_INFO("Port watchdog 
start");
+
+    for (;;) {
+        bool exit;    /* NOTE(review): same name as the C library exit(). */
+
+        atomic_read(&dp->system_port_wd_exit, &exit);
+        if (exit) {
+            break;
+        }
+
+        ovs_rwlock_rdlock(&dp->port_rwlock);
+        HMAP_FOR_EACH (port, node, &dp->ports) {
+            /* If a "system" port is configured, check its presence. */
+            if (netdev_is_configured(port->netdev) &&
+                !strcmp(netdev_get_type(port->netdev), "system") &&
+                !dp_netdev_port_exists(port->netdev)) {
+                VLOG_WARN("Watchdog detected port %s does not exist",
+                          netdev_get_name(port->netdev));
+                netdev_request_reconfigure(port->netdev);
+                rtnetlink_report_link();
+            }
+        }
+        ovs_rwlock_unlock(&dp->port_rwlock);
+        xsleep(SYSTEM_PORT_WATCHDOG_INTERVAL);
+    }
+
+    VLOG_INFO("Port watchdog exit");
+    return NULL;
+}
+/*
+ * Allocates and initializes a new datapath called 'name' of the given
+ * 'class', registers it in 'dp_netdevs', starts its port watchdog thread
+ * and adds the ODPP_LOCAL port using the "internal" open type.
+ *
+ * Returns 0 and stores the datapath in '*dpp' on success.  If adding the
+ * local port fails, the datapath is released with dp_netdev_free() and the
+ * error from do_add_port() is returned; '*dpp' is not written in that case.
+ */
+static int
+create_dp_netdev(const char *name, const struct dpif_class *class,
+                 struct dp_netdev **dpp)
+    OVS_REQUIRES(dp_netdev_mutex)
+{
+    static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
+    struct dp_netdev *dp;
+    int error;
+
+    /* Avoid estimating TSC frequency for dummy datapath to not slow down
+     * unit tests. */
+    if (!dpif_netdev_class_is_dummy(class)
+        && ovsthread_once_start(&tsc_freq_check)) {
+        pmd_perf_estimate_tsc_frequency();
+        ovsthread_once_done(&tsc_freq_check);
+    }
+
+    dp = xzalloc(sizeof *dp);
+    shash_add(&dp_netdevs, name, dp);
+    /* 'class' and 'name' are const members, hence the CONST_CAST stores. */
+    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
+    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
+    ovs_refcount_init(&dp->ref_cnt);
+    atomic_flag_clear(&dp->destroyed);
+
+    ovs_rwlock_init(&dp->port_rwlock);
+    hmap_init(&dp->ports);
+    dp->port_seq = seq_create();
+    ovs_mutex_init(&dp->bond_mutex);
+    cmap_init(&dp->tx_bonds);
+
+    fat_rwlock_init(&dp->upcall_rwlock);
+
+    dp->reconfigure_seq = seq_create();
+    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
+
+    /* Init meter resources. */
+    cmap_init(&dp->meters);
+    ovs_mutex_init(&dp->meters_lock);
+
+    /* Disable upcalls by default. 
*/
+    dp_netdev_disable_upcall(dp);
+    dp->upcall_aux = NULL;
+    dp->upcall_cb = NULL;
+
+    /* Initialize flow offloads. This call must be made early to
+     * ensure the hw-offload thread receives the required ID. */
+    dp_netdev_offload_init();
+
+    dp_netdev_ct_offload_init(dp);
+    atomic_init(&dp->system_port_wd_exit, false);
+    dp->system_port_wd_thread = ovs_thread_create("sys_port_wd", system_port_watchdog_main, dp);
+
+    dpif_miniflow_extract_init();
+
+    atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
+    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
+
+    cmap_init(&dp->poll_threads);
+    dp->pmd_rxq_assign_type = SCHED_CYCLES;
+
+    ovs_mutex_init(&dp->tx_qid_pool_mutex);
+    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
+    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
+
+    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
+    ovsthread_key_create(&dp->per_pmd_key, NULL);
+
+    ovs_rwlock_wrlock(&dp->port_rwlock);
+    /* non-PMD will be created before all other threads and will
+     * allocate static_tx_qid = 0. */
+    dp_netdev_set_nonpmd(dp);
+
+    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
+                                                             "internal"),
+                        ODPP_LOCAL, NULL);
+    ovs_rwlock_unlock(&dp->port_rwlock);
+    if (error) {
+        dp_netdev_free(dp);
+        return error;
+    }
+
+    dp->max_sleep_list = NULL;
+
+    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
+    *dpp = dp;
+    return 0;
+}
+/*
+ * Bumps 'dp->reconfigure_seq'.  dp_netdev_is_reconf_required() compares
+ * that sequence against 'dp->last_reconfigure_seq'.
+ */
+static void
+dp_netdev_request_reconfigure(struct dp_netdev *dp)
+{
+    seq_change(dp->reconfigure_seq);
+}
+/*
+ * Returns true when 'dp->reconfigure_seq' differs from
+ * 'dp->last_reconfigure_seq'.
+ */
+static bool
+dp_netdev_is_reconf_required(struct dp_netdev *dp)
+{
+    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
+}
+/*
+ * Opens the datapath 'name', looked up in 'dp_netdevs' while holding
+ * 'dp_netdev_mutex'.
+ *
+ * If no such datapath exists it is created when 'create' is true, and
+ * ENODEV is returned otherwise.  If it exists, EINVAL is returned when its
+ * class differs from 'class' and EEXIST when 'create' is true.  On success
+ * (return value 0) '*dpifp' receives the result of create_dpif_netdev().
+ */
+static int
+dpif_netdev2_open(const struct dpif_class *class, const char *name,
+                  bool create, struct dpif **dpifp)
+{
+    struct dp_netdev *dp;
+    int error;
+
+    ovs_mutex_lock(&dp_netdev_mutex);
+    dp = shash_find_data(&dp_netdevs, name);
+    if (!dp) {
+        error = create ? 
create_dp_netdev(name, class, &dp) : ENODEV;
+    } else {
+        error = (dp->class != class ? EINVAL
+                 : create ? EEXIST
+                 : 0);
+    }
+    if (!error) {
+        *dpifp = create_dpif_netdev(dp);
+    }
+    ovs_mutex_unlock(&dp_netdev_mutex);
+
+    return error;
+}
+/*
+ * Releases and then destroys 'dp->upcall_rwlock'.  Asserts first that
+ * fat_rwlock_tryrdlock() yields a nonzero result, which this code treats
+ * as evidence that the lock is currently taken (upcalls disabled).
+ */
+static void
+dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    /* Check that upcalls are disabled, i.e. that the rwlock is taken. */
+    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
+
+    /* Before freeing a lock we should release it. */
+    fat_rwlock_unlock(&dp->upcall_rwlock);
+    fat_rwlock_destroy(&dp->upcall_rwlock);
+}
+/*
+ * Returns the hash of 'bond_id'; dp_netdev_free() uses it when removing
+ * entries from 'dp->tx_bonds'.
+ */
+static uint32_t
+hash_bond_id(uint32_t bond_id)
+{
+    return hash_int(bond_id, 0);
+}
+
+/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
+ * through the 'dp_netdevs' shash while freeing 'dp'.
+ *
+ * Stops the port watchdog thread, unregisters 'dp' from 'dp_netdevs',
+ * deletes every port and every tx bond, destroys the PMD threads and then
+ * releases all remaining resources, including 'dp' itself. */
+static void
+dp_netdev_free(struct dp_netdev *dp)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    struct dp_netdev_port *port;
+    struct tx_bond *bond;
+
+    /* Check at run time that dp_netdev_mutex is locked, since static thread
+     * safety analysis is disabled for this function (presumably a nonzero
+     * trylock result means the mutex is already held -- confirm). */
+    ovs_assert(ovs_mutex_trylock(&dp_netdev_mutex));
+    /* Ask the watchdog to stop and wait for it: it reads 'dp->ports'. */
+    atomic_store(&dp->system_port_wd_exit, true);
+    xpthread_join(dp->system_port_wd_thread, NULL);
+
+    shash_find_and_delete(&dp_netdevs, dp->name);
+
+    ovs_rwlock_wrlock(&dp->port_rwlock);
+    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
+        do_del_port(dp, port);
+    }
+    ovs_rwlock_unlock(&dp->port_rwlock);
+
+    ovs_mutex_lock(&dp->bond_mutex);
+    CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
+        cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
+        ovsrcu_postpone(free, bond);
+    }
+    ovs_mutex_unlock(&dp->bond_mutex);
+
+    /* The upcall lock must be unlocked when destroying PMDs.
+     *
+     * Each thread having accessed the upcall fat_rwlock will create a per-thread key.
+     * This creation registers a destructor executed on thread exit.
+     * These destructors will deadlock if their lock is locked. 
*/
+    fat_rwlock_unlock(&dp->upcall_rwlock);
+    dp_netdev_destroy_all_pmds(dp, true);
+    fat_rwlock_wrlock(&dp->upcall_rwlock);
+
+    cmap_destroy(&dp->poll_threads);
+
+    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
+    id_pool_destroy(dp->tx_qid_pool);
+
+    ovs_mutex_destroy(&dp->non_pmd_mutex);
+    ovsthread_key_delete(dp->per_pmd_key);
+
+    dp_netdev_ct_offload_uninit(dp);
+
+    seq_destroy(dp->reconfigure_seq);
+
+    seq_destroy(dp->port_seq);
+    hmap_destroy(&dp->ports);
+    ovs_rwlock_destroy(&dp->port_rwlock);
+
+    cmap_destroy(&dp->tx_bonds);
+    ovs_mutex_destroy(&dp->bond_mutex);
+
+    /* Upcalls must be disabled at this point; the upcall lock was re-taken
+     * above after the PMD threads were destroyed. */
+    dp_netdev_destroy_upcall_lock(dp);
+
+    dp_netdev_meter_destroy(dp);
+
+    free(dp->max_sleep_list);
+    free(dp->pmd_cmask);
+    free(dp->req_pmd_cmask);
+    free(CONST_CAST(char *, dp->name));
+    free(dp);
+}
+/*
+ * Drops one reference to 'dp' and frees the datapath with dp_netdev_free()
+ * when ovs_refcount_unref_relaxed() returns 1.  A NULL 'dp' is accepted
+ * and ignored.
+ */
+static void
+dp_netdev_unref(struct dp_netdev *dp)
+{
+    if (dp) {
+        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
+         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
+        ovs_mutex_lock(&dp_netdev_mutex);
+        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
+            dp_netdev_free(dp);
+        }
+        ovs_mutex_unlock(&dp_netdev_mutex);
+    }
+}
+/*
+ * Releases the reference on the datapath behind 'dpif' and frees 'dpif'
+ * itself.
+ */
+static void
+dpif_netdev_close(struct dpif *dpif)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+
+    dp_netdev_unref(dp);
+    free(dpif);
+}
+/*
+ * Marks the datapath behind 'dpif' as destroyed.  Only the call that
+ * actually sets the 'destroyed' flag drops a reference; later calls do
+ * nothing.  Always returns 0.
+ */
+static int
+dpif_netdev_destroy(struct dpif *dpif)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+
+    if (!atomic_flag_test_and_set(&dp->destroyed)) {
+        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
+            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
+            OVS_NOT_REACHED();
+        }
+    }
+
+    return 0;
+}
+
+/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
+ * load/store semantics.  While the increment is not atomic, the load and
+ * store operations are, making it impossible to read inconsistent values.
+ *
+ * This is used to update thread local stats counters. 
*/
+static void
+non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
+{
+    unsigned long long tmp;
+
+    atomic_read_relaxed(var, &tmp);
+    tmp += n;
+    atomic_store_relaxed(var, tmp);
+}
+/*
+ * Fills '*stats' by summing over every thread in 'dp->poll_threads':
+ * 'n_flows' from the flow table sizes, 'n_hit' from the PHWOL, SIMPLE,
+ * EXACT, SMC and MASKED hit counters, and 'n_missed' / 'n_lost' from the
+ * MISS / LOST counters.  'n_masks', 'n_mask_hit' and 'n_cache_hit' are set
+ * to UINT32_MAX / UINT64_MAX rather than computed.  Always returns 0.
+ */
+static int
+dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    struct dp_netdev_pmd_thread *pmd;
+    uint64_t pmd_stats[PMD_N_STATS];
+
+    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        stats->n_flows += cmap_count(&pmd->flow_table);
+        pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
+        stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT];
+        stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT];
+        stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
+        stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
+        stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
+        stats->n_missed += pmd_stats[PMD_STAT_MISS];
+        stats->n_lost += pmd_stats[PMD_STAT_LOST];
+    }
+    stats->n_masks = UINT32_MAX;
+    stats->n_mask_hit = UINT64_MAX;
+    stats->n_cache_hit = UINT64_MAX;
+
+    return 0;
+}
+/*
+ * Makes 'pmd' pick up configuration changes.  For the thread whose
+ * 'core_id' is NON_PMD_CORE_ID the cached ports are reloaded right here,
+ * under 'non_pmd_mutex' and then 'port_mutex'.  For any other thread the
+ * 'reload_seq' is changed and the 'reload' flag is stored with release
+ * ordering; nothing is waited for in this function.
+ */
+static void
+dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
+{
+    if (pmd->core_id == NON_PMD_CORE_ID) {
+        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
+        ovs_mutex_lock(&pmd->port_mutex);
+        pmd_load_cached_ports(pmd);
+        ovs_mutex_unlock(&pmd->port_mutex);
+        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
+        return;
+    }
+
+    seq_change(pmd->reload_seq);
+    atomic_store_explicit(&pmd->reload, true, memory_order_release);
+}
+/*
+ * Returns the hash of 'port_no' used for the port hmap and for the
+ * per-port classifier map.
+ */
+static uint32_t
+hash_port_no(odp_port_t port_no)
+{
+    return hash_int(odp_to_u32(port_no), 0);
+}
+/*
+ * Opens the netdev 'devname' of type 'type' and wraps it in a newly
+ * allocated port numbered 'port_no'.
+ *
+ * Returns 0 and stores the port in '*portp' on success.  On failure
+ * '*portp' is NULL and the return value is either the error reported by
+ * netdev_open() or EINVAL for a loopback device, in which case the netdev
+ * is closed again.
+ */
+static int
+port_create(const char *devname, const char *type,
+            odp_port_t port_no, struct dp_netdev_port **portp)
+{
+    struct dp_netdev_port *port;
+    enum netdev_flags flags;
+    struct netdev *netdev;
+    int error;
+
+    *portp = NULL;
+
+    /* Open and validate network device. 
*/
+    error = netdev_open(devname, type, &netdev);
+    if (error) {
+        return error;
+    }
+    /* XXX reject non-Ethernet devices */
+
+    netdev_get_flags(netdev, &flags);    /* Return value is not checked. */
+    if (flags & NETDEV_LOOPBACK) {
+        VLOG_ERR("%s: cannot add a loopback device", devname);
+        error = EINVAL;
+        goto out;
+    }
+
+    port = xzalloc(sizeof *port);
+    port->port_no = port_no;
+    port->netdev = netdev;
+    port->type = xstrdup(type);
+    port->sf = NULL;
+    port->emc_enabled = true;
+    port->need_reconfigure = true;
+    ovs_mutex_init(&port->txq_used_mutex);
+
+    *portp = port;
+
+    return 0;
+
+out:
+    netdev_close(netdev);
+    return error;
+}
+/*
+ * Creates a port for 'devname' / 'type', inserts it into 'dp->ports' as
+ * 'port_no' and reconfigures the datapath.  When 'datapath_netdev' is
+ * non-NULL it receives the netdev of the new port.
+ *
+ * The caller holds the write side of 'dp->port_rwlock'.  This function
+ * releases it, holds the read side around reconfigure_datapath() and then
+ * re-takes the write side, so the lock is not held continuously.
+ *
+ * Returns EEXIST when a port named 'devname' already exists, or the error
+ * from port_create(); the remaining outcomes follow the reconfiguration.
+ */
+static int
+do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
+            odp_port_t port_no, struct netdev **datapath_netdev)
+    OVS_REQ_WRLOCK(dp->port_rwlock)
+{
+    struct netdev_saved_flags *sf;
+    struct dp_netdev_port *port;
+    int error;
+
+    /* Reject devices already in 'dp'. */
+    if (!get_port_by_name(dp, devname, &port)) {
+        return EEXIST;
+    }
+
+    error = port_create(devname, type, port_no, &port);
+    if (error) {
+        return error;
+    }
+    if (datapath_netdev) {
+        /* NOTE(review): written before the failure returns further down,
+         * so on error it still holds 'port->netdev' -- confirm that
+         * callers ignore it when an error is returned. */
+        *datapath_netdev = port->netdev;
+    }
+    /* If the netdev is an ESW manager, remove the
+     * disabled marking for its representors. */
+    if (netdev_dpdk_is_esw_mgr(port->netdev)) {
+        dp_netdev_esw_ports_set_disabled(dp, port->netdev, false);
+    }
+
+    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
+    seq_change(dp->port_seq);
+    /* Swap the write lock for the read lock around the reconfiguration. */
+    ovs_rwlock_unlock(&dp->port_rwlock);
+    ovs_rwlock_rdlock(&dp->port_rwlock);
+
+    reconfigure_datapath(dp);
+
+    ovs_rwlock_unlock(&dp->port_rwlock);
+    ovs_rwlock_wrlock(&dp->port_rwlock);
+
+    /* Check that port was successfully configured. 
*/
+    if (!dp_netdev_lookup_port(dp, port_no)) {
+        return EINVAL;
+    }
+    /* NOTE(review): unlike the promisc failure below, this return leaves
+     * the port in 'dp->ports' (no do_del_port()) -- confirm intended. */
+    if (!netdev_is_configured(port->netdev)) {
+        return port->netdev->reconfigure_status;
+    }
+
+    /* Updating device flags triggers an if_notifier, which triggers a bridge
+     * reconfiguration and another attempt to add this port, leading to an
+     * infinite loop if the device is configured incorrectly and cannot be
+     * added. Setting the promisc mode after a successful reconfiguration,
+     * since we already know that the device is somehow properly configured. */
+    error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
+    if (error) {
+        VLOG_ERR("%s: cannot set promisc flag", devname);
+        do_del_port(dp, port);
+        return error;
+    }
+    port->sf = sf;
+
+    return 0;
+}
+/*
+ * Adds 'netdev' to the datapath behind 'dpif' while holding the write side
+ * of 'dp->port_rwlock'.
+ *
+ * If '*port_nop' is not ODPP_NONE that number is requested and EBUSY is
+ * returned when it is already in use.  Otherwise choose_port() picks a
+ * number and EFBIG is returned when it yields ODPP_NONE.  When no error
+ * occurred up to that point, '*port_nop' is updated with the number used
+ * before do_add_port() is called.
+ */
+static int
+dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
+                     odp_port_t *port_nop, struct netdev **datapath_netdev)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
+    const char *dpif_port;
+    odp_port_t port_no;
+    int error;
+
+    ovs_rwlock_wrlock(&dp->port_rwlock);
+    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
+    if (*port_nop != ODPP_NONE) {
+        port_no = *port_nop;
+        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
+    } else {
+        port_no = choose_port(dp, dpif_port);
+        error = port_no == ODPP_NONE ? 
EFBIG : 0;
+    }
+    if (!error) {
+        *port_nop = port_no;
+        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no,
+                            datapath_netdev);
+        if (!error) {   /* NOTE(review): 'datapath_netdev' is dereferenced
+                         * below without the NULL check that do_add_port()
+                         * performs -- confirm callers never pass NULL. */
+            error = netdev_derive_tunnel_config(netdev, *datapath_netdev);
+            if (error == EOPNOTSUPP) {  /* Lack of support is not a failure. */
+                error = 0;
+            }
+        }
+    }
+    ovs_rwlock_unlock(&dp->port_rwlock);
+
+    return error;
+}
+/*
+ * Deletes port 'port_no' from the datapath behind 'dpif' while holding the
+ * write side of 'dp->port_rwlock'.  Returns EINVAL for ODPP_LOCAL, which is
+ * never deleted here, otherwise the result of get_port_by_number().
+ */
+static int
+dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    int error;
+
+    ovs_rwlock_wrlock(&dp->port_rwlock);
+    if (port_no == ODPP_LOCAL) {
+        error = EINVAL;
+    } else {
+        struct dp_netdev_port *port;
+
+        error = get_port_by_number(dp, port_no, &port);
+        if (!error) {
+            do_del_port(dp, port);
+        }
+    }
+    ovs_rwlock_unlock(&dp->port_rwlock);
+
+    return error;
+}
+/*
+ * Returns true for every port number except ODPP_NONE.
+ */
+static bool
+is_valid_port_number(odp_port_t port_no)
+{
+    return port_no != ODPP_NONE;
+}
+/*
+ * Returns the port of 'dp' numbered 'port_no', or NULL if there is none.
+ */
+static struct dp_netdev_port *
+dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
+    OVS_REQ_RDLOCK(dp->port_rwlock)
+{
+    struct dp_netdev_port *port;
+
+    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
+        if (port->port_no == port_no) {
+            return port;
+        }
+    }
+    return NULL;
+}
+/*
+ * Looks up port 'port_no' in 'dp' and stores it in '*portp'.  Returns 0 on
+ * success, EINVAL when 'port_no' is not a valid port number and ENODEV when
+ * no such port exists; '*portp' is NULL in both failure cases.
+ */
+static int
+get_port_by_number(struct dp_netdev *dp,
+                   odp_port_t port_no, struct dp_netdev_port **portp)
+    OVS_REQ_RDLOCK(dp->port_rwlock)
+{
+    if (!is_valid_port_number(port_no)) {
+        *portp = NULL;
+        return EINVAL;
+    } else {
+        *portp = dp_netdev_lookup_port(dp, port_no);
+        return *portp ? 
0 : ENODEV;
+    }
+}
+/*
+ * Releases everything owned by 'port': its netdev reference, the saved
+ * netdev flags, every rx queue, the tx-queue mutex and the heap members,
+ * then 'port' itself.  A NULL 'port' is accepted and ignored.
+ */
+static void
+port_destroy(struct dp_netdev_port *port)
+{
+    if (!port) {
+        return;
+    }
+
+    netdev_close(port->netdev);
+    netdev_restore_flags(port->sf);
+
+    for (unsigned i = 0; i < port->n_rxq; i++) {
+        netdev_rxq_close(port->rxqs[i].rx);
+    }
+    ovs_mutex_destroy(&port->txq_used_mutex);
+    free(port->rxq_affinity_list);
+    free(port->txq_used);
+    free(port->rxqs);
+    free(port->type);
+    free(port);
+}
+/*
+ * Finds the port of 'dp' whose netdev is named 'devname'.  Returns 0 and
+ * stores the port in '*portp' when found.  Returns ENODEV otherwise, in
+ * which case '*portp' is left untouched.
+ */
+static int
+get_port_by_name(struct dp_netdev *dp,
+                 const char *devname, struct dp_netdev_port **portp)
+    OVS_REQ_RDLOCK(dp->port_rwlock)
+{
+    struct dp_netdev_port *port;
+
+    HMAP_FOR_EACH (port, node, &dp->ports) {
+        if (!strcmp(netdev_get_name(port->netdev), devname)) {
+            *portp = port;
+            return 0;
+        }
+    }
+
+    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
+     * existing port. */
+    return ENODEV;
+}
+
+/* Returns 'true' if there is a port with pmd netdev. */
+static bool
+has_pmd_port(struct dp_netdev *dp)
+    OVS_REQ_RDLOCK(dp->port_rwlock)
+{
+    struct dp_netdev_port *port;
+
+    HMAP_FOR_EACH (port, node, &dp->ports) {
+        if (netdev_is_pmd(port->netdev)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+/*
+ * Removes 'port' from 'dp->ports', reconfigures the datapath, flushes the
+ * flows and offloads of the port and finally destroys it.  The caller holds
+ * the write side of 'dp->port_rwlock'; it is released around
+ * dp_netdev_offload_flush() and re-taken afterwards.
+ */
+static void
+do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
+    OVS_REQ_WRLOCK(dp->port_rwlock)
+{
+    /* If the netdev is an ESW manager, disable its members.
+     * They will be kept in the datapath but won't be polled by the PMDs.
+     * The ESW manager must be added back to re-enable them.
+     *
+     * This setting must be set before calling 'reconfigure_datapath' to
+     * properly allocate queues and balance them between PMDs. */
+
+    if (netdev_dpdk_is_esw_mgr(port->netdev)) {
+        dp_netdev_esw_ports_set_disabled(dp, port->netdev, true);
+    }
+
+    hmap_remove(&dp->ports, &port->node);
+    seq_change(dp->port_seq);
+
+    reconfigure_datapath(dp);
+
+    /* Flush and disable offloads only after 'port' has been made
+     * inaccessible through datapath reconfiguration. 
+     * This prevents having PMDs enqueuing offload requests after
+     * the flush.
+     * When only this port is deleted instead of the whole datapath,
+     * revalidator threads are still active and can still enqueue
+     * offload modification or deletion. Managing those stray requests
+     * is done in the offload threads. */
+    dp_netdev_port_flow_flush(dp, port);
+    /* The offload flush runs without 'port_rwlock' held. */
+    ovs_rwlock_unlock(&dp->port_rwlock);
+    dp_netdev_offload_flush(dp, port);
+    ovs_rwlock_wrlock(&dp->port_rwlock);
+
+    netdev_uninit_flow_api(port->netdev);
+
+    port_destroy(port);
+}
+/*
+ * Fills 'dpif_port' from 'port'.  The name and type are fresh xstrdup()
+ * copies, so the receiver of 'dpif_port' owns them.
+ */
+static void
+answer_port_query(const struct dp_netdev_port *port,
+                  struct dpif_port *dpif_port)
+{
+    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
+    dpif_port->type = xstrdup(port->type);
+    dpif_port->port_no = port->port_no;
+}
+/*
+ * Looks up port 'port_no' and, when it exists and 'dpif_port' is non-NULL,
+ * fills 'dpif_port' with its description.  Returns the result of
+ * get_port_by_number().
+ *
+ * NOTE(review): this takes the write side of 'port_rwlock' although the
+ * body only reads, whereas the by-name variant below takes a read lock --
+ * confirm whether the write lock is intentional.
+ */
+static int
+dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
+                                 struct dpif_port *dpif_port)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    struct dp_netdev_port *port;
+    int error;
+
+    ovs_rwlock_wrlock(&dp->port_rwlock);
+    error = get_port_by_number(dp, port_no, &port);
+    if (!error && dpif_port) {
+        answer_port_query(port, dpif_port);
+    }
+    ovs_rwlock_unlock(&dp->port_rwlock);
+
+    return error;
+}
+/*
+ * Looks up the port named 'devname' and, when it exists and 'dpif_port' is
+ * non-NULL, fills 'dpif_port' with its description.  Returns the result of
+ * get_port_by_name(), i.e. 0 or ENODEV.
+ */
+static int
+dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
+                               struct dpif_port *dpif_port)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    struct dp_netdev_port *port;
+    int error;
+
+    dp_netdev_port_rdlock(dp);
+    error = get_port_by_name(dp, devname, &port);
+    if (!error && dpif_port) {
+        answer_port_query(port, dpif_port);
+    }
+    ovs_rwlock_unlock(&dp->port_rwlock);
+
+    return error;
+}
+/*
+ * Frees 'flow' together with its actions (when dp_netdev_flow_get_actions()
+ * returns non-NULL) and its 'dp_extra_info'.  dp_netdev_flow_unref() hands
+ * this function to ovsrcu_postpone().
+ */
+static void
+dp_netdev_flow_free(struct dp_netdev_flow *flow)
+{
+    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
+
+    if (actions) {
+        dp_netdev_actions_free(actions);
+    }
+    if (flow->dp_extra_info) {
+        free(flow->dp_extra_info);
+    }
+    free(flow);
+}
+/*
+ * Drops one reference to 'flow'.  When ovs_refcount_unref_relaxed() returns
+ * 1, dp_netdev_flow_free() is scheduled through ovsrcu_postpone().
+ */
+void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
+{
+    if 
(ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
+        ovsrcu_postpone(dp_netdev_flow_free, flow);
+    }
+}
+/*
+ * Returns the classifier of 'pmd' for 'in_port', or NULL when none has
+ * been created yet.
+ *
+ * NOTE(review): declared 'inline' without 'static' or 'extern'; whether an
+ * external definition is emitted depends on the prototype in the headers,
+ * which is not visible here -- confirm.
+ */
+inline struct dpcls *
+dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
+                           odp_port_t in_port)
+{
+    struct dpcls *cls;
+    uint32_t hash = hash_port_no(in_port);
+    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
+        if (cls->in_port == in_port) {
+            /* Port classifier exists already */
+            return cls;
+        }
+    }
+    return NULL;
+}
+/*
+ * Returns the classifier of 'pmd' for 'in_port', creating, initializing
+ * and inserting a new one into 'pmd->classifiers' when none exists.
+ */
+static inline struct dpcls *
+dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
+                         odp_port_t in_port)
+    OVS_REQUIRES(pmd->flow_mutex)
+{
+    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
+
+    if (!cls) {
+        uint32_t hash = hash_port_no(in_port);
+
+        /* Create new classifier for in_port */
+        cls = xmalloc(sizeof(*cls));
+        dpcls_init(cls);
+        cls->in_port = in_port;
+        cmap_insert(&pmd->classifiers, &cls->node, hash);
+        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
+    }
+    return cls;
+}
+/*
+ * Entry of a 'megaflow_to_mark' map: associates the megaflow 'mega_ufid'
+ * with the flow mark 'mark'.  'node' is const, so the map operations below
+ * go through CONST_CAST.
+ */
+struct megaflow_to_mark_data {
+    const struct cmap_node node;
+    ovs_u128 mega_ufid;
+    uint32_t mark;
+};
+
+/* Associates a megaflow with a mark, which is a 1:1 mapping.  The entry is
+ * stored in the map of the calling offload thread, as reported by
+ * netdev_offload_thread_id(). */
+static void
+megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
+{
+    size_t hash = dp_netdev_flow_hash(mega_ufid);
+    struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
+    unsigned int tid = netdev_offload_thread_id();
+
+    data->mega_ufid = *mega_ufid;
+    data->mark = mark;
+
+    cmap_insert(&dp_offload_threads[tid].megaflow_to_mark,
+                CONST_CAST(struct cmap_node *, &data->node), hash);
+}
+
+/* Disassociates a megaflow from its mark.  The entry is looked up in the
+ * map of the calling offload thread; a warning is logged when it is not
+ * found there. */
+static void
+megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
+{
+    size_t hash = dp_netdev_flow_hash(mega_ufid);
+    struct megaflow_to_mark_data *data;
+    unsigned int tid = netdev_offload_thread_id();
+
+    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
+                             &dp_offload_threads[tid].megaflow_to_mark) {
+        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
+
cmap_remove(&dp_offload_threads[tid].megaflow_to_mark,
+                        CONST_CAST(struct cmap_node *, &data->node), hash);
+            ovsrcu_postpone(free, data);
+            return;
+        }
+    }
+
+    VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
+              UUID_ARGS((struct uuid *)mega_ufid));
+}
+/*
+ * Returns the mark associated with 'mega_ufid', or INVALID_FLOW_MARK when
+ * there is none.  Unlike the associate / disassociate helpers, which use
+ * the calling thread's id, the map searched here is selected with
+ * netdev_offload_ufid_to_thread_id().
+ */
+uint32_t
+megaflow_to_mark_find(const ovs_u128 *mega_ufid)
+{
+    size_t hash = dp_netdev_flow_hash(mega_ufid);
+    struct megaflow_to_mark_data *data;
+    unsigned int tid;
+
+    tid = netdev_offload_ufid_to_thread_id(*mega_ufid);
+    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
+                             &dp_offload_threads[tid].megaflow_to_mark) {
+        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
+            return data->mark;
+        }
+    }
+
+    VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
+             UUID_ARGS((struct uuid *)mega_ufid));
+    return INVALID_FLOW_MARK;
+}
+
+/* Associates a mark with a flow, which is a 1:N mapping.  Takes a
+ * reference on 'flow', inserts it into the calling offload thread's
+ * 'mark_to_flow' map and records 'mark' in 'flow->mark'. */
+static void
+mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
+{
+    unsigned int tid = netdev_offload_thread_id();
+    dp_netdev_flow_ref(flow);
+
+    cmap_insert(&dp_offload_threads[tid].mark_to_flow,
+                CONST_CAST(struct cmap_node *, &flow->mark_node),
+                hash_int(mark, 0));
+    flow->mark = mark;
+
+    VLOG_DBG("Associated dp_netdev flow %p with mark %u mega_ufid "UUID_FMT,
+             flow, mark, UUID_ARGS((struct uuid *) &flow->mega_ufid));
+}
+/*
+ * Returns true when no flow in the calling offload thread's 'mark_to_flow'
+ * map currently has 'flow->mark' equal to 'mark'.
+ */
+static bool
+flow_mark_has_no_ref(uint32_t mark)
+{
+    unsigned int tid = netdev_offload_thread_id();
+    struct dp_netdev_flow *flow;
+
+    CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
+                             &dp_offload_threads[tid].mark_to_flow) {
+        if (flow->mark == mark) {
+            return false;
+        }
+    }
+
+    return true;
+}
+/*
+ * Detaches the flow carried by 'item' from its mark.  'flow->mark' is reset
+ * first, so the flow itself no longer counts in flow_mark_has_no_ref().
+ * When no other flow holds the old mark, the mark is freed and the
+ * megaflow association is dropped.  For flows that are not E2E cache flows
+ * and had a valid mark, the flow is also removed from 'mark_to_flow' and
+ * the reference taken by mark_to_flow_associate() is released.
+ */
+static void
+mark_to_flow_disassociate(struct dp_offload_thread_item *item)
+{
+    struct dp_netdev_flow *flow = item->data->flow.flow;
+    bool is_e2e_cache_flow = item->data->flow.is_e2e_cache_flow;
+    unsigned int tid = netdev_offload_thread_id();
+    uint32_t mark = flow->mark;
+
+    flow->mark = INVALID_FLOW_MARK;
+
+    /*
+     * no flow is referencing the mark any 
more? If so, let's
+     * remove the flow from hardware and free the mark. Always remove from
+     * hardware in case of E2E cache flow.
+     */
+    if (flow_mark_has_no_ref(mark)) {   /* NOTE(review): also evaluated when
+                                         * mark is INVALID_FLOW_MARK; that
+                                         * value is only tested further
+                                         * down -- confirm intended. */
+        netdev_offload_flow_mark_free(mark);
+        VLOG_DBG("Freed flow mark %u mega_ufid "UUID_FMT, mark,
+                 UUID_ARGS((struct uuid *) &flow->mega_ufid));
+
+        megaflow_to_mark_disassociate(&flow->mega_ufid);
+    }
+
+    if (!is_e2e_cache_flow) {
+        struct cmap_node *mark_node;
+
+        /* INVALID_FLOW_MARK may mean that the flow has been disassociated
+         * or never associated. */
+        if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) {
+            return;
+        }
+
+        mark_node = CONST_CAST(struct cmap_node *, &flow->mark_node);
+        cmap_remove(&dp_offload_threads[tid].mark_to_flow, mark_node,
+                    hash_int(mark, 0));
+        dp_netdev_flow_unref(flow);
+    }
+}
+/*
+ * Searches the 'mark_to_flow' maps visited by
+ * DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN for a flow that carries 'mark',
+ * has 'pmd_id' equal to 'pmd->core_id' and is not dead.  Returns that flow,
+ * or NULL when there is none.  No reference is taken on the result.
+ */
+static struct dp_netdev_flow *
+mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
+                  const uint32_t mark)
+{
+    struct dp_offload_thread *thread;
+    struct dp_netdev_flow *flow;
+    size_t hash;
+
+    hash = hash_int(mark, 0);
+    DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN (thread) {
+        CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash,
+                                 &thread->mark_to_flow) {
+            if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
+                flow->dead == false) {
+                return flow;
+            }
+        }
+    }
+
+    return NULL;
+}
+/*
+ * Allocates a zeroed DP_OFFLOAD_FLOW work item for 'flow' with operation
+ * 'op' and timestamp 'now', and takes a reference on 'flow'.  The item is
+ * sized to hold the flow payload after the item header.  The match and
+ * actions of the payload are left zeroed for the caller to fill in.
+ */
+struct dp_offload_thread_item *
+dp_netdev_alloc_flow_offload(struct dp_netdev *dp,
+                             struct dp_netdev_flow *flow,
+                             int op, long long now)
+{
+    struct dp_offload_thread_item *item;
+    struct dp_offload_flow_item *flow_offload;
+
+    item = xzalloc(sizeof *item + sizeof *flow_offload);
+    flow_offload = &item->data->flow;
+
+    item->type = DP_OFFLOAD_FLOW;
+    item->dp = dp;
+    item->timestamp = now;
+
+    flow_offload->flow = flow;
+    flow_offload->op = op;
+    flow_offload->is_e2e_cache_flow = false;
+
+    dp_netdev_flow_ref(flow);
+
+    return item;
+}
+/*
+ * Frees the actions copy held by the flow offload item 'offload' and then
+ * the item itself.  dp_netdev_free_flow_offload() passes this function to
+ * ovsrcu_gc().
+ */
+static void
+dp_netdev_free_flow_offload__(struct dp_offload_thread_item *offload)
+{
+    struct dp_offload_flow_item *flow_offload = &offload->data->flow;
+
+
free(flow_offload->actions);
+    free(offload);
+}
+/*
+ * Releases the flow reference held by the flow offload item 'offload' and
+ * hands the item to ovsrcu_gc() with dp_netdev_free_flow_offload__() as the
+ * callback.
+ */
+static void
+dp_netdev_free_flow_offload(struct dp_offload_thread_item *offload)
+{
+    struct dp_offload_flow_item *flow_offload = &offload->data->flow;
+
+    dp_netdev_flow_unref(flow_offload->flow);
+    ovsrcu_gc(dp_netdev_free_flow_offload__, offload, gc_node);
+}
+/*
+ * Frees the offload item 'offload' according to its type: flow items go
+ * through dp_netdev_free_flow_offload(), while stats-clear, flush and
+ * connection items are freed directly.  Any other type aborts.
+ */
+void
+dp_netdev_free_offload(struct dp_offload_thread_item *offload)
+{
+    switch (offload->type) {
+    case DP_OFFLOAD_FLOW:
+        dp_netdev_free_flow_offload(offload);
+        break;
+    case DP_OFFLOAD_STATS_CLEAR:
+        /* Fallthrough */
+    case DP_OFFLOAD_FLUSH:
+        /* Fallthrough */
+    case DP_OFFLOAD_CONN:
+        free(offload);
+        break;
+    default:
+        OVS_NOT_REACHED();
+    };
+}
+/*
+ * Enqueues the flow offload item 'item' on the offload thread selected by
+ * netdev_offload_ufid_to_thread_id() for the flow's 'mega_ufid'.  Asserts
+ * that 'item' is of type DP_OFFLOAD_FLOW.
+ */
+static void
+dp_netdev_offload_flow_enqueue(struct dp_offload_thread_item *item)
+{
+    struct dp_offload_flow_item *flow_offload = &item->data->flow;
+    unsigned int tid;
+
+    ovs_assert(item->type == DP_OFFLOAD_FLOW);
+
+    tid = netdev_offload_ufid_to_thread_id(flow_offload->flow->mega_ufid);
+    dp_netdev_offload_thread_enqueue(&dp_offload_threads[tid], item);
+}
+/*
+ * Handles a flow deletion request.  Returns 0 without doing anything when
+ * the flow has no mark and is not an E2E cache flow.  Otherwise, if the
+ * netdev of the flow's input port can be obtained, the flow is deleted from
+ * it with netdev_flow_del() under the port read lock, and in every case the
+ * flow is then detached from its mark.  Returns the netdev_flow_del()
+ * result, or 0 when no netdev was found.
+ */
+int
+dp_netdev_flow_offload_del(struct dp_offload_thread_item *item)
+{
+    struct dp_netdev_flow *flow = item->data->flow.flow;
+    struct dp_netdev *dp = item->dp;
+    const char *dpif_type_str;
+    struct netdev *netdev;
+    odp_port_t in_port;
+    int ret = 0;
+
+    if (flow->mark == INVALID_FLOW_MARK &&
+        !item->data->flow.is_e2e_cache_flow) {
+        return 0;
+    }
+
+    in_port = flow->flow.in_port.odp_port;
+    dpif_type_str = dpif_normalize_type(dp->class->type);
+    netdev = netdev_ports_get(in_port, dpif_type_str);
+    if (netdev) {
+        /* Taking a global 'port_rwlock' to fulfill thread safety
+         * restrictions regarding netdev port mapping. */
+        dp_netdev_port_rdlock(dp);
+        ret = netdev_flow_del(netdev, &flow->mega_ufid, NULL);
+        ovs_rwlock_unlock(&dp->port_rwlock);
+        netdev_close(netdev);
+    }
+
+    mark_to_flow_disassociate(item);
+
+    return ret;
+}
+
+/*
+ * There are two flow offload operations here: addition and modification.
+ *
+ * For flow addition, this function does:
+ * - allocate a new flow mark id
+ * - perform hardware flow offload
+ * - associate the flow mark with flow and mega flow
+ *
+ * For flow modification, both flow mark and the associations are still
+ * valid, thus only item 2 needed.
+ */
+int
+dp_netdev_flow_offload_put(struct dp_offload_thread_item *item)
+{
+    struct dp_offload_flow_item *offload = &item->data->flow;
+    struct dp_netdev *dp = item->dp;
+    struct dp_netdev_flow *flow = offload->flow;
+    odp_port_t in_port = flow->flow.in_port.odp_port;
+    const char *dpif_type_str = dpif_normalize_type(dp->class->type);
+    /* A MOD request for a flow without a mark is handled as an addition. */
+    bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD
+                        && flow->mark != INVALID_FLOW_MARK;
+    bool is_e2e_cache_flow = offload->is_e2e_cache_flow;
+    struct offload_info info = {
+        .is_ct_conn = false,
+    };
+    struct netdev *port;
+    uint32_t mark;
+    int ret;
+    /* A flow already marked dead is not offloaded. */
+    if (flow->dead) {
+        return -1;
+    }
+
+    if (is_e2e_cache_flow || modification) {
+        /* For e2e case, mark is invalid. However, CT2CT is also marked as
+         * is_e2e_cache_flow, and for that case we need to pass the mark of
+         * last merged megaflow.
+         */
+        mark = flow->mark;
+    } else {
+        /*
+         * If a mega flow has already been offloaded (from other PMD
+         * instances), do not offload it again.
+         */
+        mark = megaflow_to_mark_find(&flow->mega_ufid);
+        if (mark != INVALID_FLOW_MARK) {
+            VLOG_DBG("Flow has already been offloaded with mark %u\n",
+                     mark);
+            if (flow->mark != INVALID_FLOW_MARK) {
+                ovs_assert(flow->mark == mark);
+            } else {
+                mark_to_flow_associate(mark, flow);
+            }
+            return 0;
+        }
+
+        mark = netdev_offload_flow_mark_alloc();
+        if (mark == INVALID_FLOW_MARK) {
+            VLOG_ERR("Failed to allocate flow mark!\n");
+            return -1;
+        }
+    }
+
+    /* First associate the mark<->flow, so if the HW flow hits with a mark,
+     * the flow will be found.
+     */
+    if (!modification) {
+        if (!is_e2e_cache_flow) {
+            megaflow_to_mark_associate(&flow->mega_ufid, mark);
+            mark_to_flow_associate(mark, flow);
+        } else {
+            flow->mark = INVALID_FLOW_MARK;
+        }
+    }
+
+    info.flow_mark = mark;
+    info.orig_in_port = offload->orig_in_port;
+    info.is_e2e_cache_flow = offload->is_e2e_cache_flow;
+    info.ct_counter_key = offload->ct_counter_key;
+    memcpy(&info.flows_counter_key, &offload->flows_counter_key,
+           sizeof offload->flows_counter_key);
+
+    port = netdev_ports_get(in_port, dpif_type_str);
+    if (!port) {
+        goto err_free;
+    }
+
+    /* Taking a global 'port_rwlock' to fulfill thread safety
+     * restrictions regarding the netdev port mapping. */
+    dp_netdev_port_rdlock_limit(dp, 50);
+    ret = netdev_flow_put(port, &offload->match,
+                          CONST_CAST(struct nlattr *, offload->actions),
+                          offload->actions_len, &flow->mega_ufid, &info,
+                          NULL);
+    ovs_rwlock_unlock(&dp->port_rwlock);
+    netdev_close(port);
+
+    if (ret) {
+        goto err_free;
+    }
+
+    return 0;
+
+err_free:
+    /* Nothing is undone here for E2E cache flows. */
+    if (!is_e2e_cache_flow) {
+        mark_to_flow_disassociate(item);
+    }
+    return -1;
+}
+/*
+ * Executes the flow offload request carried by 'item': ADD and MOD go to
+ * dp_netdev_flow_offload_put(), DEL goes to dp_netdev_flow_offload_del().
+ * Any other operation aborts.  The outcome is only logged at debug level;
+ * nothing is returned to the caller.
+ */
+void
+dp_offload_flow(struct dp_offload_thread_item *item)
+{
+    struct dp_offload_flow_item *flow_offload = &item->data->flow;
+    const char *op;
+    int ret;
+
+    switch (flow_offload->op) {
+    case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
+        op = "add";
+        ret = dp_netdev_flow_offload_put(item);
+        break;
+    case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
+        op = "modify";
+        ret = dp_netdev_flow_offload_put(item);
+        break;
+    case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
+        op = "delete";
+        ret = dp_netdev_flow_offload_del(item);
+        break;
+    default:
+        OVS_NOT_REACHED();
+    }
+
+    VLOG_DBG("%s to %s netdev flow "UUID_FMT,
+             ret == 0 ? "succeed" : "failed", op,
+             UUID_ARGS((struct uuid *) &flow_offload->flow->mega_ufid));
+}
+/*
+ * If the calling thread has a value stored under 'dp->per_pmd_key', passes
+ * that PMD thread and 'netdev' to pmd_thread_offload_disable().  Does
+ * nothing for threads without such a value.
+ */
+static void
+dp_netdev_per_pmd_port_disable(struct dp_netdev *dp, struct netdev *netdev)
+{
+    struct dp_netdev_pmd_thread *pmd;
+
+    pmd = ovsthread_getspecific(dp->per_pmd_key);
+    if (pmd) {
+        pmd_thread_offload_disable(pmd, netdev);
+    }
+}
+/*
+ * Executes the flush request carried by 'item': under the port read lock
+ * the netdev is hidden from other offload calls and its offloaded flows
+ * are flushed.  Afterwards one reference on 'flush->count' is dropped, and
+ * when ovs_refcount_unref() returns 2 the condition 'flush->cond' is
+ * signalled under 'flush->mutex'.
+ */
+void
+dp_offload_flush(struct dp_offload_thread_item *item)
+{
+    struct dp_offload_flush_item *flush = &item->data->flush;
+
+    dp_netdev_port_rdlock_limit(item->dp, 50);
+    /* Disable access for other offload calls. */
+    netdev_ports_set_visible(flush->netdev, false);
+    dp_netdev_per_pmd_port_disable(item->dp, flush->netdev);
+    netdev_flow_flush(flush->netdev);
+    ovs_rwlock_unlock(&item->dp->port_rwlock);
+
+    /* The other remaining reference is on the flush initiator thread. */
+    if (ovs_refcount_unref(flush->count) == 2) {
+        ovs_mutex_lock(flush->mutex);
+        xpthread_cond_signal(flush->cond);
+        ovs_mutex_unlock(flush->mutex);
+    }
+}
+/*
+ * Main loop of an offload thread.  After registering itself and acquiring
+ * its three queues (ufid, offload and trace), the thread repeatedly polls
+ * them, processes the single message obtained, records the latency when an
+ * enqueue time is known and quiesces for RCU once the deadline kept in
+ * 'next_rcu' has passed.
+ */
+static void *
+dp_netdev_flow_offload_main(void *arg OVS_UNUSED)
+{
+    struct e2e_cache_trace_message *trace_msg;
+    struct dp_offload_thread_item *offload;
+    struct dp_offload_thread *ofl_thread;
+    struct e2e_cache_ufid_msg *ufid_msg;
+    long long int next_rcu;
+    unsigned int tid;
+
+    tid = netdev_offload_thread_init();
+    dp_netdev_offload_thread_init(&dp_offload_threads[tid]);
+    ofl_thread = &dp_offload_threads[tid];
+
+    mpsc_queue_acquire(&ofl_thread->ufid_queue);
+    mpsc_queue_acquire(&ofl_thread->offload_queue);
+    mpsc_queue_acquire(&ofl_thread->trace_queue);
+
+    next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
+
+    for (;;) {
+        long long int enqueue_time_us = 0, finish_time_us = 0;
+
+        dp_netdev_offload_poll_queues(ofl_thread, &ufid_msg, &offload,
+                                      &trace_msg);
+
+        /* Only one of the message types should be popped. 
*/
+        ovs_assert((ufid_msg != NULL && offload == NULL && trace_msg == NULL) ||
+                   (offload != NULL && ufid_msg == NULL && trace_msg == NULL) ||
+                   (trace_msg != NULL && ufid_msg == NULL && offload == NULL));
+
+        if (ufid_msg != NULL) {
+            enqueue_time_us = e2e_cache_flow_db_handle_ufid_msg(ufid_msg);
+        } else if (offload != NULL) {
+            dp_offload_process(ofl_thread, offload);
+        } else if (trace_msg != NULL) {
+            uint32_t i, num_elements;
+
+            ofl_thread->e2e_stats.processed_trcs++;
+            num_elements = trace_msg->num_elements;
+            for (i = 0; i < num_elements; i++) {
+                e2e_cache_process_trace_info((struct dp_netdev *)trace_msg->dp,
+                                             &trace_msg->data[i], tid);
+            }
+            enqueue_time_us = trace_msg->timestamp;
+            free_cacheline(trace_msg);
+        }
+
+        finish_time_us = time_usec();
+        if (enqueue_time_us) {  /* Stays 0 on the 'offload' branch above. */
+            dp_offload_measure_latency(ofl_thread, enqueue_time_us, finish_time_us);
+        }
+
+        /* Do RCU synchronization at fixed interval. */
+        if (finish_time_us > next_rcu) {
+            coverage_clear();
+            ovsrcu_quiesce();
+            next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
+        }
+    }
+    /* The loop above has no break, so the teardown below never runs. */
+    OVS_NOT_REACHED();
+    mpsc_queue_release(&ofl_thread->ufid_queue);
+    mpsc_queue_release(&ofl_thread->offload_queue);
+    mpsc_queue_release(&ofl_thread->trace_queue);
+    netdev_offload_thread_uninit();
+    dp_netdev_offload_thread_uninit(&dp_offload_threads[tid]);
+
+    return NULL;
+}
+/*
+ * Queues a hardware offload deletion for 'flow'.  Does nothing when the
+ * flow API is disabled.  When the E2E cache is enabled the deletion is
+ * reported to it first; a DEL work item stamped with 'pmd->ctx.now' is
+ * then allocated and enqueued.
+ */
+static void
+queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
+                      struct dp_netdev_flow *flow)
+{
+    struct dp_offload_thread_item *offload;
+
+    if (!netdev_is_flow_api_enabled()) {
+        return;
+    }
+
+    if (dp_netdev_e2e_cache_enabled) {
+        e2e_cache_flow_del(&flow->mega_ufid, pmd->dp, NULL, pmd->ctx.now);
+    }
+    offload = dp_netdev_alloc_flow_offload(pmd->dp, flow,
+                                           DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
+                                           pmd->ctx.now);
+    dp_netdev_offload_flow_enqueue(offload);
+}
+/*
+ * Logs, at debug level and subject to the 'upcall_rl' rate limit, the
+ * addition of 'flow' or, when 'old_actions' is non-NULL, its modification.
+ * Two messages are emitted: the installed flow (with the old actions for a
+ * modification) and the transient 'match' with the current actions.
+ */
+static void
+log_netdev_flow_change(const struct dp_netdev_flow *flow,
+                       const struct match *match,
+                       const struct dp_netdev_actions *old_actions)
+{
+    const char *prefix = 
old_actions ? "flow_mod" : "flow_add"; + const struct dp_netdev_actions *dp_actions; + struct ds ds = DS_EMPTY_INITIALIZER; + struct ofpbuf key_buf, mask_buf; + struct odp_flow_key_parms odp_parms = { + .flow = &match->flow, + .mask = &match->wc.masks, + .support = dp_netdev_support, + }; + + if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) { + return; + } + + dp_netdev_flow_format(prefix, &ds, flow); + if (old_actions) { + ds_put_cstr(&ds, ", old_actions:"); + format_odp_actions(&ds, old_actions->actions, old_actions->size, + NULL); + } + + VLOG_DBG("%s", ds_cstr(&ds)); + + /* Add a printout of the temporary flow. + * It can differ from the match within the dp_netdev_flow installed. + */ + ds_clear(&ds); + ds_put_cstr(&ds, "Transient flow: "); + + ofpbuf_init(&key_buf, 0); + ofpbuf_init(&mask_buf, 0); + + odp_flow_key_from_flow(&odp_parms, &key_buf); + odp_parms.key_buf = &key_buf; + odp_flow_key_from_mask(&odp_parms, &mask_buf); + + odp_flow_format(key_buf.data, key_buf.size, + mask_buf.data, mask_buf.size, + NULL, &ds, false); + + ofpbuf_uninit(&key_buf); + ofpbuf_uninit(&mask_buf); + + dp_actions = dp_netdev_flow_get_actions(flow); + ds_put_cstr(&ds, ", actions:"); + format_odp_actions(&ds, dp_actions->actions, dp_actions->size, + NULL); + + VLOG_DBG("%s", ds_cstr(&ds)); + + ds_destroy(&ds); +} + +static void +queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_flow *flow, struct match *match, + const struct nlattr *actions, size_t actions_len, + int op) +{ + struct dp_offload_thread_item *item; + struct dp_offload_flow_item *flow_offload; + + if (!netdev_is_flow_api_enabled()) { + return; + } + + if (dp_netdev_e2e_cache_enabled) { + e2e_cache_flow_put(false, &flow->mega_ufid, match, actions, + actions_len, pmd->ctx.now); + } + + item = dp_netdev_alloc_flow_offload(pmd->dp, flow, op, pmd->ctx.now); + flow_offload = &item->data->flow; + flow_offload->match = *match; + flow_offload->actions = xmalloc(actions_len); + if (actions_len) { + 
memcpy(flow_offload->actions, actions, actions_len); + } + flow_offload->actions_len = actions_len; + flow_offload->orig_in_port = flow->orig_in_port; + flow->offload_requested = true; + + dp_netdev_offload_flow_enqueue(item); +} + +static void +dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_flow *flow) + OVS_REQUIRES(pmd->flow_mutex) +{ + struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node); + struct dpcls *cls; + odp_port_t in_port = flow->flow.in_port.odp_port; + + cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); + ovs_assert(cls != NULL); + dpcls_remove(cls, &flow->cr); + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_UPDATE, 1); + dp_netdev_simple_match_remove(pmd, flow); + cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid)); + ccmap_dec(&pmd->n_flows, odp_to_u32(in_port)); + if (flow->offload_requested) { + queue_netdev_flow_del(pmd, flow); + } + flow->dead = true; + + if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) { + struct ds s = DS_EMPTY_INITIALIZER; + + dp_netdev_flow_format("flow_del", &s, flow); + VLOG_DBG("%s", ds_cstr(&s)); + ds_destroy(&s); + } + + dp_netdev_flow_unref(flow); +} + +static void +dp_netdev_offload_flush_enqueue(struct dp_netdev *dp, + struct netdev *netdev, + struct ovs_refcount *count, + struct ovs_mutex *mutex, + pthread_cond_t *cond) +{ + struct { + struct dp_offload_thread *thread; + struct dp_offload_thread_item *item; + } dispatch[MAX_OFFLOAD_THREAD_NB]; + long long int now_us = time_usec(); + struct dp_offload_thread *thread; + int flush_count = 0; + + memset(dispatch, 0, sizeof dispatch); + + DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN (thread) { + struct dp_offload_thread_item *item; + struct dp_offload_flush_item *flush; + + item = xmalloc(sizeof *item + sizeof *flush); + item->type = DP_OFFLOAD_FLUSH; + item->dp = dp; + item->timestamp = now_us; + + flush = &item->data->flush; + flush->netdev = netdev; + flush->count = count; + flush->mutex = mutex; + 
flush->cond = cond; + + /* Set all the expected refs before enqueuing any request, + * to ensure that no offload thread will spuriously trigger + * the cond due to a lower count. */ + ovs_refcount_ref(count); + dispatch[flush_count].thread = thread; + dispatch[flush_count].item = item; + flush_count++; + } + + for (int i = 0; i < flush_count; i++) { + dp_netdev_offload_thread_enqueue(dispatch[i].thread, dispatch[i].item); + } +} + +/* Blocking call that will wait on the offload threads to + * complete their work. As the flush order will only be + * enqueued after existing offload requests, those previous + * offload requests must be processed, which requires being + * able to read-lock the 'port_rwlock' from the offload thread. + * + * Flow offload flush is done when a port is being deleted. + * Right after this call executes, the offload API is disabled + * for the port. This call must be made blocking until the + * offload provider completed its job. + */ +static void +dp_netdev_offload_flush(struct dp_netdev *dp, + struct dp_netdev_port *port) + OVS_EXCLUDED(dp->port_rwlock) +{ + struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER; + pthread_cond_t cond = PTHREAD_COND_INITIALIZER; + struct ovs_refcount count; + struct netdev *netdev; + + if (!netdev_is_flow_api_enabled()) { + return; + } + + netdev = netdev_ref(port->netdev); + ovs_refcount_init(&count); + + conntrack_offload_netdev_flush(dp->conntrack, netdev); + ovs_mutex_lock(&mutex); + dp_netdev_offload_flush_enqueue(dp, netdev, &count, &mutex, &cond); + ovsrcu_quiesce_start(); + ovs_mutex_cond_wait(&cond, &mutex); + ovsrcu_quiesce_end(); + ovs_mutex_unlock(&mutex); + + netdev_close(netdev); + ovs_mutex_destroy(&mutex); + xpthread_cond_destroy(&cond); +} + +static void +get_dpif_flow_status(const struct dp_netdev *dp, + const struct dp_netdev_flow *netdev_flow_, + struct dpif_flow_stats *stats, + struct dpif_flow_attrs *attrs); + +static void +dp_netdev_pmd_flow_flush__(struct dp_netdev_pmd_thread *pmd, struct 
dp_netdev_port *port) +{ + struct dp_netdev_flow *netdev_flow; + + ovs_mutex_lock(&pmd->flow_mutex); + CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) { + odp_port_t flow_port_no = netdev_flow->flow.in_port.odp_port; + + if (port != NULL && flow_port_no != port->port_no) { + continue; + } + + dp_netdev_pmd_remove_flow(pmd, netdev_flow); + } + ovs_mutex_unlock(&pmd->flow_mutex); +} + +static void +dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd) +{ + dp_netdev_pmd_flow_flush__(pmd, NULL); +} + +static void +dp_netdev_port_flow_flush(struct dp_netdev *dp, struct dp_netdev_port *port) +{ + struct dp_netdev_pmd_thread *pmd; + + if (netdev_dpdk_is_esw_mgr(port->netdev)) { + struct dp_netdev_port *iter_port; + int esw_mgr_pid; + + esw_mgr_pid = netdev_dpdk_get_esw_mgr_port_id(port->netdev); + + HMAP_FOR_EACH (iter_port, node, &dp->ports) { + if (esw_mgr_pid == netdev_dpdk_get_esw_mgr_port_id(iter_port->netdev)) { + dp_netdev_port_flow_flush(dp, iter_port); + } + } + } + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + dp_netdev_pmd_flow_flush__(pmd, port); + } +} + +static int +dpif_netdev_flow_flush(struct dpif *dpif) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_pmd_thread *pmd; + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + dp_netdev_pmd_flow_flush(pmd); + } + + return 0; +} + +struct dp_netdev_port_state { + struct hmap_position position; + char *name; +}; + +static int +dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep) +{ + *statep = xzalloc(sizeof(struct dp_netdev_port_state)); + return 0; +} + +static int +dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_, + struct dpif_port *dpif_port) +{ + struct dp_netdev_port_state *state = state_; + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct hmap_node *node; + int retval; + + dp_netdev_port_rdlock(dp); + node = hmap_at_position(&dp->ports, &state->position); + if (node) { + struct dp_netdev_port *port; + + port = 
CONTAINER_OF(node, struct dp_netdev_port, node); + + free(state->name); + state->name = xstrdup(netdev_get_name(port->netdev)); + dpif_port->name = state->name; + dpif_port->type = port->type; + dpif_port->port_no = port->port_no; + + retval = 0; + } else { + retval = EOF; + } + ovs_rwlock_unlock(&dp->port_rwlock); + + return retval; +} + +static int +dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_) +{ + struct dp_netdev_port_state *state = state_; + free(state->name); + free(state); + return 0; +} + +static int +dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED) +{ + struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); + uint64_t new_port_seq; + int error; + + new_port_seq = seq_read(dpif->dp->port_seq); + if (dpif->last_port_seq != new_port_seq) { + dpif->last_port_seq = new_port_seq; + error = ENOBUFS; + } else { + error = EAGAIN; + } + + return error; +} + +static void +dpif_netdev_port_poll_wait(const struct dpif *dpif_) +{ + struct dpif_netdev *dpif = dpif_netdev_cast(dpif_); + + seq_wait(dpif->dp->port_seq, dpif->last_port_seq); +} + +static struct dp_netdev_flow * +dp_netdev_flow_cast(const struct dpcls_rule *cr) +{ + return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL; +} + +bool dp_netdev_flow_ref(struct dp_netdev_flow *flow) +{ + return ovs_refcount_try_ref_rcu(&flow->ref_cnt); +} + +/* netdev_flow_key utilities. + * + * netdev_flow_key is basically a miniflow. We use these functions + * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow + * functions (miniflow_clone_inline, miniflow_equal, ...), because: + * + * - Since we are dealing exclusively with miniflows created by + * miniflow_extract(), if the map is different the miniflow is different. + * Therefore we can be faster by comparing the map and the miniflow in a + * single memcmp(). + * - These functions can be inlined by the compiler. 
*/ + +static inline bool +netdev_flow_key_equal(const struct netdev_flow_key *a, + const struct netdev_flow_key *b) +{ + /* 'b->len' may be not set yet. */ + return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len); +} + +static inline void +netdev_flow_key_clone(struct netdev_flow_key *dst, + const struct netdev_flow_key *src) +{ + memcpy(dst, src, + offsetof(struct netdev_flow_key, mf) + src->len); +} + +/* Initialize a netdev_flow_key 'mask' from 'match'. */ +static inline void +netdev_flow_mask_init(struct netdev_flow_key *mask, + const struct match *match) +{ + uint64_t *dst = miniflow_values(&mask->mf); + struct flowmap fmap; + uint32_t hash = 0; + size_t idx; + + /* Only check masks that make sense for the flow. */ + flow_wc_map(&match->flow, &fmap); + flowmap_init(&mask->mf.map); + + FLOWMAP_FOR_EACH_INDEX(idx, fmap) { + uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx); + + if (mask_u64) { + flowmap_set(&mask->mf.map, idx, 1); + *dst++ = mask_u64; + hash = hash_add64(hash, mask_u64); + } + } + + map_t map; + + FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) { + hash = hash_add64(hash, map); + } + + size_t n = dst - miniflow_get_values(&mask->mf); + + mask->hash = hash_finish(hash, n * 8); + mask->len = netdev_flow_key_size(n); +} + +/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */ +static inline void +netdev_flow_key_init_masked(struct netdev_flow_key *dst, + const struct flow *flow, + const struct netdev_flow_key *mask) +{ + uint64_t *dst_u64 = miniflow_values(&dst->mf); + const uint64_t *mask_u64 = miniflow_get_values(&mask->mf); + uint32_t hash = 0; + uint64_t value; + + dst->len = mask->len; + dst->mf = mask->mf; /* Copy maps. */ + + FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) { + *dst_u64 = value & *mask_u64++; + hash = hash_add64(hash, *dst_u64++); + } + dst->hash = hash_finish(hash, + (dst_u64 - miniflow_get_values(&dst->mf)) * 8); +} + +/* Initializes 'key' as a copy of 'flow'. 
*/ +static inline void +netdev_flow_key_init(struct netdev_flow_key *key, + const struct flow *flow) +{ + uint32_t hash = 0; + uint64_t value; + + miniflow_map_init(&key->mf, flow); + miniflow_init(&key->mf, flow); + + size_t n = miniflow_n_values(&key->mf); + + FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) { + hash = hash_add64(hash, value); + } + + key->hash = hash_finish(hash, n * 8); + key->len = netdev_flow_key_size(n); +} + +static inline void +emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow, + const struct netdev_flow_key *key) +{ + if (ce->flow != flow) { + if (ce->flow) { + dp_netdev_flow_unref(ce->flow); + } + + if (dp_netdev_flow_ref(flow)) { + ce->flow = flow; + } else { + ce->flow = NULL; + } + } + if (key) { + netdev_flow_key_clone(&ce->key, key); + } +} + +static inline void +emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key, + struct dp_netdev_flow *flow) +{ + struct emc_entry *to_be_replaced = NULL; + struct emc_entry *current_entry; + + EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) { + if (netdev_flow_key_equal(¤t_entry->key, key)) { + /* We found the entry with the 'mf' miniflow */ + emc_change_entry(current_entry, flow, NULL); + return; + } + + /* Replacement policy: put the flow in an empty (not alive) entry, or + * in the first entry where it can be */ + if (!to_be_replaced + || (emc_entry_alive(to_be_replaced) + && !emc_entry_alive(current_entry)) + || current_entry->key.hash < to_be_replaced->key.hash) { + to_be_replaced = current_entry; + } + } + /* We didn't find the miniflow in the cache. + * The 'to_be_replaced' entry is where the new flow will be stored */ + if (!emc_entry_alive(to_be_replaced)) { + /* Only count as new insertion if 'to_be_replaced' was not alive. 
*/ + atomic_count_inc(&cache->n_entries); + } + emc_change_entry(to_be_replaced, flow, key); +} + +static inline void +emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd, + const struct netdev_flow_key *key, + struct dp_netdev_flow *flow) +{ + /* Insert an entry into the EMC based on probability value 'min'. By + * default the value is UINT32_MAX / 100 which yields an insertion + * probability of 1/100 ie. 1% */ + + uint32_t min = pmd->ctx.emc_insert_min; + + if (min && random_uint32() <= min) { + emc_insert(&(pmd->flow_cache).emc_cache, key, flow); + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_UPDATE, 1); + } +} + +static inline const struct cmap_node * +smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash) +{ + struct smc_cache *cache = &(pmd->flow_cache).smc_cache; + struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK]; + uint16_t sig = hash >> 16; + uint16_t index = UINT16_MAX; + + for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { + if (bucket->sig[i] == sig) { + index = bucket->flow_idx[i]; + break; + } + } + if (index != UINT16_MAX) { + return cmap_find_by_index(&pmd->flow_table, index); + } + return NULL; +} + +/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is + * turned off, 2) the flow_table index is larger than uint16_t can handle. + * If there is already an SMC entry having same signature, the index will be + * updated. If there is no existing entry, but an empty entry is available, + * the empty entry will be taken. If no empty entry or existing same signature, + * a random entry from the hashed bucket will be picked. 
*/ +static inline void +smc_insert(struct dp_netdev_pmd_thread *pmd, + const struct netdev_flow_key *key, + uint32_t hash) +{ + struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache; + struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK]; + uint16_t index; + uint32_t cmap_index; + int i; + + if (!pmd->ctx.smc_enable_db) { + return; + } + + cmap_index = cmap_find_index(&pmd->flow_table, hash); + index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index; + + /* If the index is larger than SMC can handle (uint16_t), we don't + * insert */ + if (index == UINT16_MAX) { + return; + } + + /* If an entry with same signature already exists, update the index */ + uint16_t sig = key->hash >> 16; + for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { + if (bucket->sig[i] == sig) { + bucket->flow_idx[i] = index; + /* Count 1 delete + 1 add. */ + pmd_perf_update_counter(&pmd->perf_stats, + PMD_STAT_SMC_UPDATE, 2); + return; + } + } + /* If there is an empty entry, occupy it. */ + for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) { + if (bucket->flow_idx[i] == UINT16_MAX) { + bucket->sig[i] = sig; + bucket->flow_idx[i] = index; + atomic_count_inc(&smc_cache->n_entries); + pmd_perf_update_counter(&pmd->perf_stats, + PMD_STAT_SMC_UPDATE, 1); + return; + } + } + /* Otherwise, pick a random entry. */ + i = random_uint32() % SMC_ENTRY_PER_BUCKET; + bucket->sig[i] = sig; + bucket->flow_idx[i] = index; + atomic_count_inc(&smc_cache->n_entries); + pmd_perf_update_counter(&pmd->perf_stats, + PMD_STAT_SMC_UPDATE, 1); +} + +inline void +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd, + const struct netdev_flow_key *keys, + struct dpcls_rule **rules, + uint32_t emc_insert_mask) +{ + while (emc_insert_mask) { + uint32_t i = raw_ctz(emc_insert_mask); + emc_insert_mask &= emc_insert_mask - 1; + /* Get the require parameters for EMC/SMC from the rule */ + struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]); + /* Insert the key into EMC/SMC. 
*/ + emc_probabilistic_insert(pmd, &keys[i], flow); + } +} + +inline void +smc_insert_batch(struct dp_netdev_pmd_thread *pmd, + const struct netdev_flow_key *keys, + struct dpcls_rule **rules, + uint32_t smc_insert_mask) +{ + while (smc_insert_mask) { + uint32_t i = raw_ctz(smc_insert_mask); + smc_insert_mask &= smc_insert_mask - 1; + /* Get the require parameters for EMC/SMC from the rule */ + struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]); + uint32_t hash = dp_netdev_flow_hash(&flow->ufid); + /* Insert the key into EMC/SMC. */ + smc_insert(pmd, &keys[i], hash); + } +} + +static struct dp_netdev_flow * +dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd, + const struct netdev_flow_key *key, + int *lookup_num_p) +{ + struct dpcls *cls; + struct dpcls_rule *rule = NULL; + odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, + in_port.odp_port)); + struct dp_netdev_flow *netdev_flow = NULL; + + cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); + if (OVS_LIKELY(cls)) { + dpcls_lookup(cls, &key, &rule, 1, lookup_num_p); + netdev_flow = dp_netdev_flow_cast(rule); + } + return netdev_flow; +} + +static struct dp_netdev_flow * +dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd, + const ovs_u128 *ufidp, const struct nlattr *key, + size_t key_len) +{ + struct dp_netdev_flow *netdev_flow; + struct flow flow; + ovs_u128 ufid; + + /* If a UFID is not provided, determine one based on the key. 
*/ + if (!ufidp && key && key_len + && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) { + odp_flow_key_hash(&flow, sizeof flow, &ufid); + ufidp = &ufid; + } + + if (ufidp) { + CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp), + &pmd->flow_table) { + if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) { + return netdev_flow; + } + } + } + + return NULL; +} + +static void +dp_netdev_flow_set_last_stats_attrs(struct dp_netdev_flow *netdev_flow, + const struct dpif_flow_stats *stats, + const struct dpif_flow_attrs *attrs, + int result) +{ + struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats; + struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs; + + atomic_store_relaxed(&netdev_flow->netdev_flow_get_result, result); + if (result) { + return; + } + + atomic_store_relaxed(&last_stats->used, stats->used); + atomic_store_relaxed(&last_stats->packet_count, stats->n_packets); + atomic_store_relaxed(&last_stats->byte_count, stats->n_bytes); + atomic_store_relaxed(&last_stats->tcp_flags, stats->tcp_flags); + atomic_store_relaxed(&last_stats->partial_packet_count, stats->n_partial_packets); + atomic_store_relaxed(&last_stats->partial_byte_count, stats->n_partial_bytes); + + atomic_store_relaxed(&last_attrs->offloaded, attrs->offloaded); + atomic_store_relaxed(&last_attrs->partially_offloaded, + attrs->partially_offloaded); + atomic_store_relaxed(&last_attrs->dp_layer, attrs->dp_layer); + +} + +static void +dp_netdev_flow_get_last_stats_attrs(struct dp_netdev_flow *netdev_flow, + struct dpif_flow_stats *stats, + struct dpif_flow_attrs *attrs, + int *result) +{ + struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats; + struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs; + + atomic_read_relaxed(&netdev_flow->netdev_flow_get_result, result); + if (*result) { + return; + } + + atomic_read_relaxed(&last_stats->used, &stats->used); + atomic_read_relaxed(&last_stats->packet_count, 
&stats->n_packets); + atomic_read_relaxed(&last_stats->byte_count, &stats->n_bytes); + atomic_read_relaxed(&last_stats->tcp_flags, &stats->tcp_flags); + atomic_read_relaxed(&last_stats->partial_packet_count, &stats->n_partial_packets); + atomic_read_relaxed(&last_stats->partial_byte_count, &stats->n_partial_bytes); + + atomic_read_relaxed(&last_attrs->offloaded, &attrs->offloaded); + atomic_read_relaxed(&last_attrs->partially_offloaded, + &attrs->partially_offloaded); + atomic_read_relaxed(&last_attrs->dp_layer, &attrs->dp_layer); +} + +static int +dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp, + struct dp_netdev_flow *netdev_flow, + struct dpif_flow_stats *stats, + struct dpif_flow_attrs *attrs, + long long now, + long long prev_now) +{ + uint64_t act_buf[1024 / 8]; + bool merged_ret = false; + struct nlattr *actions; + struct netdev *netdev; + struct match match; + struct ofpbuf buf; + int ret = 0; + + if (!netdev_is_flow_api_enabled()) { + return EINVAL; + } + + netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port, + dpif_normalize_type(dp->class->type)); + if (!netdev) { + return EINVAL; + } + ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf); + /* Taking a global 'port_rwlock' to fulfill thread safety + * restrictions regarding netdev port mapping. + * + * XXX: Main thread will try to pause/stop all revalidators during datapath + * reconfiguration via datapath purge callback (dp_purge_cb) while + * rw-holding 'dp->port_rwlock'. So we're not waiting for lock here. + * Otherwise, deadlock is possible, because revalidators might sleep + * waiting for the main thread to release the lock and main thread + * will wait for them to stop processing. + * This workaround might make statistics less accurate. Especially + * for flow deletion case, since there will be no other attempt. 
*/ + if (!ovs_rwlock_tryrdlock(&dp->port_rwlock)) { + ret = netdev_flow_get(netdev, &match, &actions, + &netdev_flow->mega_ufid, stats, attrs, &buf, now); + /* Storing statistics and attributes from the last request for + * later use on mutex contention. */ + dp_netdev_flow_set_last_stats_attrs(netdev_flow, stats, attrs, ret); + /* Get merged flow stats and update it to mt flow stats. As CT connections + * are offloaded either to MT or e2e (but not both), even if we fail to + * get stats for MT CT, we still need to query the e2e. + */ + if (dp_netdev_e2e_cache_enabled) { + merged_ret = + e2e_cache_get_merged_flows_stats(netdev, &match, &actions, + &netdev_flow->mega_ufid, + stats, &buf, now, prev_now); + } + ovs_rwlock_unlock(&dp->port_rwlock); + } else { + dp_netdev_flow_get_last_stats_attrs(netdev_flow, stats, attrs, &ret); + if (!ret && !attrs->dp_layer) { + /* Flow was never reported as 'offloaded' so it's harmless + * to continue to think so. */ + ret = EAGAIN; + } + } + netdev_close(netdev); + if (ret) { + return merged_ret ? 
0 : ret; + } + + return 0; +} + +static void +get_dpif_flow_status(const struct dp_netdev *dp, + const struct dp_netdev_flow *netdev_flow_, + struct dpif_flow_stats *stats, + struct dpif_flow_attrs *attrs) +{ + struct dpif_flow_stats offload_stats; + struct dpif_flow_attrs offload_attrs; + struct dp_netdev_flow *netdev_flow; + unsigned long long n; + long long used; + uint16_t flags; + + netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_); + + if (stats) { + atomic_read_relaxed(&netdev_flow->stats.packet_count, &n); + stats->n_packets = n; + atomic_read_relaxed(&netdev_flow->stats.byte_count, &n); + stats->n_bytes = n; + atomic_read_relaxed(&netdev_flow->stats.used, &used); + stats->used = used; + atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags); + stats->tcp_flags = flags; + atomic_read_relaxed(&netdev_flow->stats.partial_packet_count, &n); + stats->n_partial_packets = n; + atomic_read_relaxed(&netdev_flow->stats.partial_byte_count, &n); + stats->n_partial_bytes = n; + } + + if (!dpif_netdev_get_flow_offload_status(dp, netdev_flow, + &offload_stats, &offload_attrs, + time_msec(), 0)) { + if (stats) { + stats->n_packets += offload_stats.n_packets; + stats->n_bytes += offload_stats.n_bytes; + stats->used = MAX(stats->used, offload_stats.used); + stats->tcp_flags |= offload_stats.tcp_flags; + } + if (attrs) { + attrs->offloaded = offload_attrs.offloaded; + attrs->partially_offloaded = offload_attrs.partially_offloaded; + attrs->dp_layer = offload_attrs.dp_layer; + } + } else if (attrs) { + attrs->offloaded = false; + attrs->partially_offloaded = false; + attrs->dp_layer = "ovs"; + } +} + +/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for + * storing the netlink-formatted key/mask. 'key_buf' may be the same as + * 'mask_buf'. Actions will be returned without copying, by relying on RCU to + * protect them. 
*/ +static void +dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp, + const struct dp_netdev_flow *netdev_flow, + struct ofpbuf *key_buf, struct ofpbuf *mask_buf, + struct dpif_flow *flow, bool terse) +{ + if (terse) { + memset(flow, 0, sizeof *flow); + } else { + struct flow_wildcards wc; + struct dp_netdev_actions *actions; + size_t offset; + struct odp_flow_key_parms odp_parms = { + .flow = &netdev_flow->flow, + .mask = &wc.masks, + .support = dp_netdev_support, + }; + + miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks); + /* in_port is exact matched, but we have left it out from the mask for + * optimnization reasons. Add in_port back to the mask. */ + wc.masks.in_port.odp_port = ODPP_NONE; + + /* Key */ + offset = key_buf->size; + flow->key = ofpbuf_tail(key_buf); + odp_flow_key_from_flow(&odp_parms, key_buf); + flow->key_len = key_buf->size - offset; + + /* Mask */ + offset = mask_buf->size; + flow->mask = ofpbuf_tail(mask_buf); + odp_parms.key_buf = key_buf; + odp_flow_key_from_mask(&odp_parms, mask_buf); + flow->mask_len = mask_buf->size - offset; + + /* Actions */ + actions = dp_netdev_flow_get_actions(netdev_flow); + flow->actions = actions->actions; + flow->actions_len = actions->size; + } + + flow->ufid = netdev_flow->ufid; + flow->ufid_present = true; + flow->pmd_id = netdev_flow->pmd_id; + + get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs); + flow->attrs.dp_extra_info = netdev_flow->dp_extra_info; +} + +static int +dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len, + const struct nlattr *mask_key, + uint32_t mask_key_len, const struct flow *flow, + struct flow_wildcards *wc, bool probe) +{ + enum odp_key_fitness fitness; + + fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL); + if (fitness) { + if (!probe) { + /* This should not happen: it indicates that + * odp_flow_key_from_mask() and odp_flow_key_to_mask() + * disagree on the acceptable form of a mask. 
Log the problem + * as an error, with enough details to enable debugging. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + + if (!VLOG_DROP_ERR(&rl)) { + struct ds s; + + ds_init(&s); + odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s, + true); + VLOG_ERR("internal error parsing flow mask %s (%s)", + ds_cstr(&s), odp_key_fitness_to_string(fitness)); + ds_destroy(&s); + } + } + + return EINVAL; + } + + return 0; +} + +static int +dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len, + struct flow *flow, bool probe) +{ + if (odp_flow_key_to_flow(key, key_len, flow, NULL)) { + if (!probe) { + /* This should not happen: it indicates that + * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on + * the acceptable form of a flow. Log the problem as an error, + * with enough details to enable debugging. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + + if (!VLOG_DROP_ERR(&rl)) { + struct ds s; + + ds_init(&s); + odp_flow_format(key, key_len, NULL, 0, NULL, &s, true); + VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s)); + ds_destroy(&s); + } + } + + return EINVAL; + } + + if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) { + return EINVAL; + } + + return 0; +} + +static int +dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_flow *netdev_flow; + struct dp_netdev_pmd_thread *pmd; + struct hmapx to_find = HMAPX_INITIALIZER(&to_find); + struct hmapx_node *node; + int error = EINVAL; + + if (get->pmd_id == PMD_ID_NULL) { + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) { + dp_netdev_pmd_unref(pmd); + } + } + } else { + pmd = dp_netdev_get_pmd(dp, get->pmd_id); + if (!pmd) { + goto out; + } + hmapx_add(&to_find, pmd); + } + + if (!hmapx_count(&to_find)) { + goto out; + } + + HMAPX_FOR_EACH (node, &to_find) { + pmd = 
(struct dp_netdev_pmd_thread *) node->data; + netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key, + get->key_len); + if (netdev_flow) { + dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer, + get->buffer, get->flow, false); + error = 0; + break; + } else { + error = ENOENT; + } + } + + HMAPX_FOR_EACH (node, &to_find) { + pmd = (struct dp_netdev_pmd_thread *) node->data; + dp_netdev_pmd_unref(pmd); + } +out: + hmapx_destroy(&to_find); + return error; +} + +void +dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid) +{ + struct flow masked_flow; + size_t i; + + for (i = 0; i < sizeof(struct flow); i++) { + ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] & + ((uint8_t *)&match->wc)[i]; + } + odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid); +} + +uint64_t +dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type, + uint8_t nw_frag, ovs_be16 vlan_tci) +{ + /* Simple Match Mark: + * + * BE: + * +-----------------+-------------++---------+---+-----------+ + * | in_port | dl_type || nw_frag |CFI| VID(12) | + * +-----------------+-------------++---------+---+-----------+ + * 0 32 47 49 51 52 63 + * + * LE: + * +-----------------+-------------+------++-------+---+------+ + * | in_port | dl_type |VID(8)||nw_frag|CFI|VID(4)| + * +-----------------+-------------+------++-------+---+------+ + * 0 32 47 48 55 57 59 60 61 63 + * + * Big Endian Little Endian + * in_port : 32 bits [ 0..31] in_port : 32 bits [ 0..31] + * dl_type : 16 bits [32..47] dl_type : 16 bits [32..47] + * : 1 bit [48..48] vlan VID: 8 bits [48..55] + * nw_frag : 2 bits [49..50] : 1 bit [56..56] + * vlan CFI: 1 bit [51..51] nw_frag : 2 bits [57..59] + * vlan VID: 12 bits [52..63] vlan CFI: 1 bit [60..60] + * vlan VID: 4 bits [61..63] + * + * Layout is different for LE and BE in order to save a couple of + * network to host translations. 
+ * */ + return ((uint64_t) odp_to_u32(in_port) << 32) + | ((OVS_FORCE uint32_t) dl_type << 16) +#if WORDS_BIGENDIAN + | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT) +#else + | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8)) +#endif + | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI)); +} + +struct dp_netdev_flow * +dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd, + odp_port_t in_port, ovs_be16 dl_type, + uint8_t nw_frag, ovs_be16 vlan_tci) +{ + uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, + nw_frag, vlan_tci); + uint32_t hash = hash_uint64(mark); + struct dp_netdev_flow *flow; + bool found = false; + + CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node, + hash, &pmd->simple_match_table) { + if (flow->simple_match_mark == mark) { + found = true; + break; + } + } + return found ? flow : NULL; +} + +bool +dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd, + odp_port_t in_port) +{ + return ccmap_find(&pmd->n_flows, odp_to_u32(in_port)) + == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port)); +} + +static void +dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_flow *dp_flow) + OVS_REQUIRES(pmd->flow_mutex) +{ + odp_port_t in_port = dp_flow->flow.in_port.odp_port; + ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci; + ovs_be16 dl_type = dp_flow->flow.dl_type; + uint8_t nw_frag = dp_flow->flow.nw_frag; + + if (!dp_netdev_flow_ref(dp_flow)) { + return; + } + + /* Avoid double insertion. Should not happen in practice. 
*/ + dp_netdev_simple_match_remove(pmd, dp_flow); + + uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, + nw_frag, vlan_tci); + uint32_t hash = hash_uint64(mark); + + dp_flow->simple_match_mark = mark; + cmap_insert(&pmd->simple_match_table, + CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node), + hash); + ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port)); + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_UPDATE, 1); + + VLOG_DBG("Simple match insert: " + "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").", + pmd->core_id, in_port, mark); +} + +static void +dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_flow *dp_flow) + OVS_REQUIRES(pmd->flow_mutex) +{ + odp_port_t in_port = dp_flow->flow.in_port.odp_port; + ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci; + ovs_be16 dl_type = dp_flow->flow.dl_type; + uint8_t nw_frag = dp_flow->flow.nw_frag; + struct dp_netdev_flow *flow; + uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type, + nw_frag, vlan_tci); + uint32_t hash = hash_uint64(mark); + + flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type, + nw_frag, vlan_tci); + if (flow == dp_flow) { + VLOG_DBG("Simple match remove: " + "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").", + pmd->core_id, in_port, mark); + cmap_remove(&pmd->simple_match_table, + CONST_CAST(struct cmap_node *, &flow->simple_match_node), + hash); + ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port)); + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_UPDATE, 1); + dp_netdev_flow_unref(flow); + } +} + +static bool +dp_netdev_flow_is_simple_match(const struct match *match) +{ + const struct flow *flow = &match->flow; + const struct flow_wildcards *wc = &match->wc; + + if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) { + return false; + } + + /* Check that flow matches only minimal set of fields that always set. 
* Also checking that VLAN VID+CFI is an exact match, because these
     * are not mandatory and could be masked. */
    struct flow_wildcards *minimal = xmalloc(sizeof *minimal);
    ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI);

    flow_wildcards_init_catchall(minimal);
    /* 'dpif-netdev' always has following in exact match:
     *   - recirc_id                   <-- recirc_id == 0 checked on input.
     *   - in_port                     <-- Will be checked on input.
     *   - packet_type                 <-- Assuming all packets are PT_ETH.
     *   - dl_type                     <-- Need to match with.
     *   - vlan_tci                    <-- Need to match with.
     *   - and nw_frag for ip packets. <-- Need to match with.
     */
    WC_MASK_FIELD(minimal, recirc_id);
    WC_MASK_FIELD(minimal, in_port);
    WC_MASK_FIELD(minimal, packet_type);
    WC_MASK_FIELD(minimal, dl_type);
    WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask);
    WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK);

    /* Reject the flow if it matches on anything outside 'minimal', or if its
     * VLAN TCI mask is not exactly VID+CFI. */
    if (flow_wildcards_has_extra(minimal, wc)
        || wc->masks.vlans[0].tci != vlan_tci_mask) {
        free(minimal);
        return false;
    }
    free(minimal);

    return true;
}

/* Creates a new flow on 'pmd' from 'match', 'ufid' and the 'actions_len'
 * bytes of 'actions', and returns it.
 *
 * 'match' must exact-match in_port (asserted below).  'orig_in_port' is
 * stored in the new flow. */
static struct dp_netdev_flow *
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
                   struct match *match, const ovs_u128 *ufid,
                   const struct nlattr *actions, size_t actions_len,
                   odp_port_t orig_in_port)
    OVS_REQUIRES(pmd->flow_mutex)
{
    struct ds extra_info = DS_EMPTY_INITIALIZER;
    struct dp_netdev_flow *flow;
    struct netdev_flow_key mask;
    struct dpcls *cls;
    size_t unit;

    /* Make sure in_port is exact matched before we read it. */
    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
    odp_port_t in_port = match->flow.in_port.odp_port;

    /* As we select the dpcls based on the port number, each netdev flow
     * belonging to the same dpcls will have the same odp_port value.
     * For performance reasons we wildcard odp_port here in the mask.
In the
     * typical case dp_hash is also wildcarded, and the resulting 8-byte
     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
     * will not be part of the subtable mask.
     * This will speed up the hash computation during dpcls_lookup() because
     * there is one less call to hash_add64() in this case. */
    match->wc.masks.in_port.odp_port = 0;
    netdev_flow_mask_init(&mask, match);
    /* Restore the exact match on in_port now that 'mask' has been built. */
    match->wc.masks.in_port.odp_port = ODPP_NONE;

    /* Make sure wc does not have metadata. */
    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));

    /* Do not allocate extra space. */
    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
    memset(&flow->stats, 0, sizeof flow->stats);
    atomic_init(&flow->netdev_flow_get_result, 0);
    memset(&flow->last_stats, 0, sizeof flow->last_stats);
    memset(&flow->last_attrs, 0, sizeof flow->last_attrs);
    flow->dead = false;
    flow->batch = NULL;
    flow->mark = INVALID_FLOW_MARK;
    flow->orig_in_port = orig_in_port;
    flow->skip_actions = 0;
    flow->partial_offload = false;
    flow->offload_requested = false;
    /* 'pmd_id', 'flow' and 'ufid' are written through CONST_CAST here, during
     * initialization of the new flow. */
    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
    ovs_refcount_init(&flow->ref_cnt);
    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));

    dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);

    /* Select dpcls for in_port.  Relies on in_port to be exact match.
*/
    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
    dpcls_insert(cls, &flow->cr, &mask);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_UPDATE, 1);

    /* Build "miniflow_bits(a,b,...)": the number of 1-bits in each unit of
     * the flow's mask map, comma separated. */
    ds_put_cstr(&extra_info, "miniflow_bits(");
    FLOWMAP_FOR_EACH_UNIT (unit) {
        if (unit) {
            ds_put_char(&extra_info, ',');
        }
        ds_put_format(&extra_info, "%d",
                      count_1bits(flow->cr.mask->mf.map.bits[unit]));
    }
    ds_put_char(&extra_info, ')');
    flow->dp_extra_info = ds_steal_cstr(&extra_info);
    ds_destroy(&extra_info);

    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
                dp_netdev_flow_hash(&flow->ufid));
    ccmap_inc(&pmd->n_flows, odp_to_u32(in_port));

    if (dp_netdev_flow_is_simple_match(match)) {
        dp_netdev_simple_match_insert(pmd, flow);
    }

    queue_netdev_flow_put(pmd, flow, match, actions, actions_len,
                          DP_NETDEV_FLOW_OFFLOAD_OP_ADD);
    log_netdev_flow_change(flow, match, NULL);

    return flow;
}

/* Applies the flow put request 'put' to 'pmd' while holding
 * 'pmd->flow_mutex'.
 *
 * The existing flow is looked up by 'put->ufid' when that is set, otherwise
 * by 'key'.  With DPIF_FP_CREATE a missing flow is added from 'match' and
 * 'ufid', and an existing one yields EEXIST.  DPIF_FP_MODIFY is handled only
 * when DPIF_FP_CREATE is not set.
 *
 * If 'stats' is nonnull it is zeroed on entry.  Returns 0 or a positive errno
 * value. */
static int
flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
                struct netdev_flow_key *key,
                struct match *match,
                ovs_u128 *ufid,
                const struct dpif_flow_put *put,
                struct dpif_flow_stats *stats)
{
    struct dp_netdev_flow *netdev_flow = NULL;
    int error = 0;

    if (stats) {
        memset(stats, 0, sizeof *stats);
    }

    ovs_mutex_lock(&pmd->flow_mutex);
    if (put->ufid) {
        netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid,
                                              put->key, put->key_len);
    } else {
        /* Use key instead of the locally generated ufid
         * to search netdev_flow. */
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
    }

    if (put->flags & DPIF_FP_CREATE) {
        if (!netdev_flow) {
            dp_netdev_flow_add(pmd, match, ufid,
                               put->actions, put->actions_len, ODPP_NONE);
        } else {
            error = EEXIST;
        }
        goto exit;
    }

    if (put->flags & DPIF_FP_MODIFY) {
        if (!netdev_flow) {
            error = ENOENT;
        } else {
            if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) {
                /* Overlapping flow.
*/
                error = EINVAL;
                goto exit;
            }

            struct dp_netdev_actions *new_actions;
            struct dp_netdev_actions *old_actions;

            new_actions = dp_netdev_actions_create(put->actions,
                                                   put->actions_len);

            /* Publish the new actions; the old ones are freed through
             * ovsrcu_postpone() below. */
            old_actions = dp_netdev_flow_get_actions(netdev_flow);
            ovsrcu_set(&netdev_flow->actions, new_actions);

            queue_netdev_flow_put(pmd, netdev_flow, match,
                                  put->actions, put->actions_len,
                                  DP_NETDEV_FLOW_OFFLOAD_OP_MOD);
            log_netdev_flow_change(netdev_flow, match, old_actions);

            get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
            if (put->flags & DPIF_FP_ZERO_STATS) {
                /* XXX: The userspace datapath uses thread local statistics
                 * (for flows), which should be updated only by the owning
                 * thread.  Since we cannot write on stats memory here,
                 * we choose not to support this flag.  Please note:
                 * - This feature is currently used only by dpctl commands with
                 *   option --clear.
                 * - Should the need arise, this operation can be implemented
                 *   by keeping a base value (to be updated here) for each
                 *   counter, and subtracting it before outputting the stats */
                error = EOPNOTSUPP;
            }

            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
        }
    }

exit:
    ovs_mutex_unlock(&pmd->flow_mutex);
    return error;
}

/* Handles a flow put request on 'dpif'.
 *
 * The key and mask attributes of 'put' are converted into a match.  The
 * request is rejected with EINVAL unless in_port is exact-matched.  Returns 0
 * or a positive errno value. */
static int
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
{
    struct dp_netdev *dp = get_dp_netdev2(dpif);
    struct netdev_flow_key key;
    struct dp_netdev_pmd_thread *pmd;
    struct match match;
    ovs_u128 ufid;
    int error;
    bool probe = put->flags & DPIF_FP_PROBE;

    match_init_catchall(&match);
    if (put->stats) {
        memset(put->stats, 0, sizeof *put->stats);
    }
    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
                                          probe);
    if (error) {
        return error;
    }
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
                                          put->mask, put->mask_len,
                                          &match.flow, &match.wc, probe);
    if (error) {
        return error;
    }

    if (match.wc.masks.in_port.odp_port != ODPP_NONE) {
        static
struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

        VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match",
                    (put->flags & DPIF_FP_CREATE) ? "[create]"
                    : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]");
        return EINVAL;
    }

    /* Without a caller-supplied UFID, derive one by hashing the unmasked
     * flow. */
    if (put->ufid) {
        ufid = *put->ufid;
    } else {
        odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
    }

    /* The Netlink encoding of datapath flow keys cannot express
     * wildcarding the presence of a VLAN tag.  Instead, a missing VLAN
     * tag is interpreted as exact match on the fact that there is no
     * VLAN.  Unless we refactor a lot of code that translates between
     * Netlink and struct flow representations, we have to do the same
     * here.  This must be in sync with 'match' in handle_packet_upcall(). */
    if (!match.wc.masks.vlans[0].tci) {
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
    }

    /* Must produce a netdev_flow_key for lookup.
     * Use the same method as employed to create the key when adding
     * the flow to the dpcls to make sure they match.
     * We need to put in the unmasked key as flow_put_on_pmd() will first try
     * to see if an entry exists doing a packet type lookup.  As masked-out
     * fields are interpreted as zeros, they could falsely match a wider IP
     * address mask.  Installation of the flow will use the match variable.
*/ + netdev_flow_key_init(&key, &match.flow); + + if (put->pmd_id == PMD_ID_NULL) { + if (cmap_count(&dp->poll_threads) == 0) { + return EINVAL; + } + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + struct dpif_flow_stats pmd_stats; + int pmd_error; + + pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, + &pmd_stats); + if (pmd_error) { + error = pmd_error; + } else if (put->stats) { + put->stats->n_packets += pmd_stats.n_packets; + put->stats->n_bytes += pmd_stats.n_bytes; + put->stats->used = MAX(put->stats->used, pmd_stats.used); + put->stats->tcp_flags |= pmd_stats.tcp_flags; + } + } + } else { + pmd = dp_netdev_get_pmd(dp, put->pmd_id); + if (!pmd) { + return EINVAL; + } + error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats); + dp_netdev_pmd_unref(pmd); + } + + return error; +} + +static int +flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd, + struct dpif_flow_stats *stats, + const struct dpif_flow_del *del) +{ + struct dp_netdev_flow *netdev_flow; + struct dpif_flow_attrs attrs; + int error = 0; + + ovs_mutex_lock(&pmd->flow_mutex); + netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key, + del->key_len); + if (netdev_flow) { + get_dpif_flow_status(pmd->dp, netdev_flow, stats, &attrs); + dp_netdev_pmd_remove_flow(pmd, netdev_flow); + } else { + error = ENOENT; + } + ovs_mutex_unlock(&pmd->flow_mutex); + + return error; +} + +static int +dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_pmd_thread *pmd; + int error = 0; + + if (del->stats) { + memset(del->stats, 0, sizeof *del->stats); + } + + if (del->pmd_id == PMD_ID_NULL) { + if (cmap_count(&dp->poll_threads) == 0) { + return EINVAL; + } + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + struct dpif_flow_stats pmd_stats; + int pmd_error; + + pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del); + if (pmd_error) { + error = pmd_error; + } else if (del->stats) { + del->stats->n_packets += 
pmd_stats.n_packets; + del->stats->n_bytes += pmd_stats.n_bytes; + del->stats->used = MAX(del->stats->used, pmd_stats.used); + del->stats->tcp_flags |= pmd_stats.tcp_flags; + } + } + } else { + pmd = dp_netdev_get_pmd(dp, del->pmd_id); + if (!pmd) { + return EINVAL; + } + error = flow_del_on_pmd(pmd, del->stats, del); + dp_netdev_pmd_unref(pmd); + } + + + return error; +} + +struct dpif_netdev_flow_dump { + struct dpif_flow_dump up; + struct cmap_position poll_thread_pos; + struct cmap_position flow_pos; + struct dp_netdev_pmd_thread *cur_pmd; + int status; + struct ovs_mutex mutex; +}; + +static struct dpif_netdev_flow_dump * +dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump) +{ + return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up); +} + +static struct dpif_flow_dump * +dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse, + struct dpif_flow_dump_types *types OVS_UNUSED) +{ + struct dpif_netdev_flow_dump *dump; + + dump = xzalloc(sizeof *dump); + dpif_flow_dump_init(&dump->up, dpif_); + dump->up.terse = terse; + ovs_mutex_init(&dump->mutex); + + return &dump->up; +} + +static int +dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_) +{ + struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_); + + ovs_mutex_destroy(&dump->mutex); + free(dump); + return 0; +} + +struct dpif_netdev_flow_dump_thread { + struct dpif_flow_dump_thread up; + struct dpif_netdev_flow_dump *dump; + struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH]; + struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; +}; + +static struct dpif_netdev_flow_dump_thread * +dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread) +{ + return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up); +} + +static struct dpif_flow_dump_thread * +dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_) +{ + struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_); + struct dpif_netdev_flow_dump_thread *thread; + + thread = 
xmalloc(sizeof *thread); + dpif_flow_dump_thread_init(&thread->up, &dump->up); + thread->dump = dump; + return &thread->up; +} + +static void +dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_) +{ + struct dpif_netdev_flow_dump_thread *thread + = dpif_netdev_flow_dump_thread_cast(thread_); + + free(thread); +} + +static int +dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_, + struct dpif_flow *flows, int max_flows) +{ + struct dpif_netdev_flow_dump_thread *thread + = dpif_netdev_flow_dump_thread_cast(thread_); + struct dpif_netdev_flow_dump *dump = thread->dump; + struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH]; + struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif); + struct dp_netdev *dp = get_dp_netdev2(&dpif->dpif); + int n_flows = 0; + int i; + + ovs_mutex_lock(&dump->mutex); + if (!dump->status) { + struct dp_netdev_pmd_thread *pmd = dump->cur_pmd; + int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH); + + /* First call to dump_next(), extracts the first pmd thread. + * If there is no pmd thread, returns immediately. */ + if (!pmd) { + pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos); + if (!pmd) { + ovs_mutex_unlock(&dump->mutex); + return n_flows; + + } + } + + do { + for (n_flows = 0; n_flows < flow_limit; n_flows++) { + struct cmap_node *node; + + node = cmap_next_position(&pmd->flow_table, &dump->flow_pos); + if (!node) { + break; + } + netdev_flows[n_flows] = CONTAINER_OF(node, + struct dp_netdev_flow, + node); + } + /* When finishing dumping the current pmd thread, moves to + * the next. */ + if (n_flows < flow_limit) { + memset(&dump->flow_pos, 0, sizeof dump->flow_pos); + dp_netdev_pmd_unref(pmd); + pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos); + if (!pmd) { + dump->status = EOF; + break; + } + } + /* Keeps the reference to next caller. 
*/
            dump->cur_pmd = pmd;

            /* If the current dump is empty, do not exit the loop, since the
             * remaining pmds could have flows to be dumped.  Just dumps again
             * on the new 'pmd'. */
        } while (!n_flows);
    }
    ovs_mutex_unlock(&dump->mutex);

    /* Convert the collected flows outside of 'dump->mutex', using this
     * thread's own key and mask buffers. */
    for (i = 0; i < n_flows; i++) {
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
        struct dpif_flow *f = &flows[i];
        struct ofpbuf key, mask;

        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
        dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
                                    dump->up.terse);
    }

    return n_flows;
}

/* Executes 'execute->actions' on 'execute->packet'.
 *
 * Returns EINVAL if the packet is shorter than ETH_HEADER_LEN or longer than
 * UINT16_MAX, EBUSY if no pmd context can be found for the calling thread,
 * and 0 otherwise.  Probe requests return 0 without executing anything. */
static int
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev2(dpif);
    struct dp_netdev_pmd_thread *pmd;
    struct dp_packet_batch pp;

    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
        dp_packet_size(execute->packet) > UINT16_MAX) {
        return EINVAL;
    }

    /* Tries finding the 'pmd'.  If NULL is returned, that means
     * the current thread is a non-pmd thread and should use
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
    pmd = ovsthread_getspecific(dp->per_pmd_key);
    if (!pmd) {
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
        if (!pmd) {
            return EBUSY;
        }
    }

    if (execute->probe) {
        /* If this is part of a probe, Drop the packet, since executing
         * the action may actually cause spurious packets be sent into
         * the network. */
        if (pmd->core_id == NON_PMD_CORE_ID) {
            dp_netdev_pmd_unref(pmd);
        }
        return 0;
    }

    /* If the current thread is non-pmd thread, acquires
     * the 'non_pmd_mutex'. */
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_lock(&dp->non_pmd_mutex);
    }

    /* Update current time in PMD context.  We don't care about EMC insertion
     * probability, because we are on a slow path.
*/
    pmd_thread_ctx_time_update(pmd);

    /* The action processing expects the RSS hash to be valid, because
     * it's always initialized at the beginning of datapath processing.
     * In this case, though, 'execute->packet' may not have gone through
     * the datapath at all, it may have been generated by the upper layer
     * (OpenFlow packet-out, BFD frame, ...). */
    if (!dp_packet_rss_valid(execute->packet)) {
        dp_packet_set_rss_hash(execute->packet,
                               flow_hash_5tuple(execute->flow, 0));
    }

    /* Making a copy because the packet might be stolen during the execution
     * and caller might still need it. */
    struct dp_packet *packet_clone = dp_packet_clone(execute->packet);
    dp_packet_batch_init_packet(&pp, packet_clone);
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow, NULL,
                              execute->actions, execute->actions_len);
    dp_netdev_pmd_flush_output_packets(pmd, true);

    /* Done with the non-pmd context: release its mutex and drop the
     * reference on the pmd. */
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_unlock(&dp->non_pmd_mutex);
        dp_netdev_pmd_unref(pmd);
    }

    if (dp_packet_batch_size(&pp) == 1) {
        /* Packet wasn't dropped during the execution.  Swapping content with
         * the original packet, because the caller might expect actions to
         * modify it.  Using the packet from a batch instead of 'packet_clone'
         * because it maybe stolen and replaced by other packet, e.g. by
         * the fragmentation engine. */
        dp_packet_swap(execute->packet, pp.packets[0]);
        dp_packet_delete_batch(&pp, true);
    } else if (dp_packet_batch_size(&pp)) {
        /* FIXME: We have more packets than expected.  Likely, we got IP
         * fragments of the reassembled packet.  Dropping them here as we have
         * no way to get them to the caller.  It might be that all the required
         * actions with them are already executed, but it also might not be a
         * case, e.g. if dpif_netdev_execute() called to execute a single
         * tunnel push.
*/ + dp_packet_delete_batch(&pp, true); + } + + return 0; +} + +static void +dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, + enum dpif_offload_type offload_type OVS_UNUSED) +{ + size_t i; + + for (i = 0; i < n_ops; i++) { + struct dpif_op *op = ops[i]; + + switch (op->type) { + case DPIF_OP_FLOW_PUT: + op->error = dpif_netdev_flow_put(dpif, &op->flow_put); + break; + + case DPIF_OP_FLOW_DEL: + op->error = dpif_netdev_flow_del(dpif, &op->flow_del); + break; + + case DPIF_OP_EXECUTE: + op->error = dpif_netdev_execute(dpif, &op->execute); + break; + + case DPIF_OP_FLOW_GET: + op->error = dpif_netdev_flow_get(dpif, &op->flow_get); + break; + } + } +} + +/* Enable or Disable PMD auto load balancing. */ +static void +set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log) +{ + struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; + + if (pmd_alb->is_enabled != state || always_log) { + pmd_alb->is_enabled = state; + if (pmd_alb->is_enabled) { + uint8_t rebalance_load_thresh; + + atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, + &rebalance_load_thresh); + VLOG_INFO("PMD auto load balance is enabled, " + "interval %"PRIu64" mins, " + "pmd load threshold %"PRIu8"%%, " + "improvement threshold %"PRIu8"%%.", + pmd_alb->rebalance_intvl / MIN_TO_MSEC, + rebalance_load_thresh, + pmd_alb->rebalance_improve_thresh); + } else { + pmd_alb->rebalance_poll_timer = 0; + VLOG_INFO("PMD auto load balance is disabled."); + } + } +} + +static int +parse_pmd_sleep_list(const char *max_sleep_list, + struct pmd_sleep **pmd_sleeps) +{ + char *list, *copy, *key, *value; + int num_vals = 0; + + if (!max_sleep_list) { + return num_vals; + } + + list = copy = xstrdup(max_sleep_list); + + while (ofputil_parse_key_value(&list, &key, &value)) { + uint64_t temp, pmd_max_sleep; + char *error = NULL; + unsigned core; + int i; + + error = str_to_u64(key, &temp); + if (error) { + free(error); + continue; + } + + if (value[0] == '\0') { + /* No value specified. 
key is dp default. */ + core = UINT_MAX; + pmd_max_sleep = temp; + } else { + error = str_to_u64(value, &pmd_max_sleep); + if (!error && temp < UINT_MAX) { + /* Key is pmd core id. */ + core = (unsigned) temp; + } else { + free(error); + continue; + } + } + + /* Detect duplicate max sleep values. */ + for (i = 0; i < num_vals; i++) { + if ((*pmd_sleeps)[i].core_id == core) { + break; + } + } + if (i == num_vals) { + /* Not duplicate, add a new entry. */ + *pmd_sleeps = xrealloc(*pmd_sleeps, + (num_vals + 1) * sizeof **pmd_sleeps); + num_vals++; + } + + pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); + + (*pmd_sleeps)[i].core_id = core; + (*pmd_sleeps)[i].max_sleep = pmd_max_sleep; + } + + free(copy); + return num_vals; +} + +static void +log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep) +{ + if (core_id == NON_PMD_CORE_ID) { + return; + } + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, " + "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep); +} + +static void +pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) +{ + uint64_t max_sleep = dp->pmd_max_sleep_default; + struct pmd_sleep *pmd_sleeps = NULL; + int num_vals; + + num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps); + + /* Check if the user has set a specific value for this pmd. */ + for (int i = 0; i < num_vals; i++) { + if (pmd_sleeps[i].core_id == pmd->core_id) { + max_sleep = pmd_sleeps[i].max_sleep; + break; + } + } + atomic_init(&pmd->max_sleep, max_sleep); + log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep); + free(pmd_sleeps); +} + +static bool +assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals, + struct pmd_sleep *pmd_sleeps) +{ + struct dp_netdev_pmd_thread *pmd; + bool value_changed = false; + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + uint64_t new_max_sleep, cur_pmd_max_sleep; + + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + /* Default to global value. 
*/ + new_max_sleep = dp->pmd_max_sleep_default; + + /* Check for pmd specific value. */ + for (int i = 0; i < num_vals; i++) { + if (pmd->core_id == pmd_sleeps[i].core_id) { + new_max_sleep = pmd_sleeps[i].max_sleep; + break; + } + } + atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); + if (new_max_sleep != cur_pmd_max_sleep) { + atomic_store_relaxed(&pmd->max_sleep, new_max_sleep); + value_changed = true; + } + } + return value_changed; +} + +void +log_all_pmd_sleeps(struct dp_netdev *dp) +{ + struct dp_netdev_pmd_thread **pmd_list = NULL; + struct dp_netdev_pmd_thread *pmd; + size_t n; + + VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.", + dp->pmd_max_sleep_default); + + sorted_poll_thread_list(dp, &pmd_list, &n); + + for (size_t i = 0; i < n; i++) { + uint64_t cur_pmd_max_sleep; + + pmd = pmd_list[i]; + atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); + log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep); + } + free(pmd_list); +} + +bool +set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config) +{ + const char *max_sleep_list = smap_get(config, "pmd-sleep-max"); + struct pmd_sleep *pmd_sleeps = NULL; + uint64_t default_max_sleep = 0; + bool default_changed = false; + bool pmd_changed = false; + uint64_t pmd_maxsleep; + int num_vals = 0; + + /* Check for deprecated 'pmd-maxsleep' value. */ + pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX); + if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) { + VLOG_WARN_ONCE("pmd-maxsleep is deprecated. " + "Please use pmd-sleep-max instead."); + default_max_sleep = pmd_maxsleep; + } + + /* Check if there is no change in string or value. */ + if (!!dp->max_sleep_list == !!max_sleep_list) { + if (max_sleep_list + ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list) + : default_max_sleep == dp->pmd_max_sleep_default) { + return false; + } + } + + /* Free existing string and copy new one (if any). 
*/
    free(dp->max_sleep_list);
    dp->max_sleep_list = nullable_xstrdup(max_sleep_list);

    if (max_sleep_list) {
        num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps);

        /* Check if the user has set a global value. */
        for (int i = 0; i < num_vals; i++) {
            if (pmd_sleeps[i].core_id == UINT_MAX) {
                default_max_sleep = pmd_sleeps[i].max_sleep;
                break;
            }
        }
    }

    if (dp->pmd_max_sleep_default != default_max_sleep) {
        dp->pmd_max_sleep_default = default_max_sleep;
        default_changed = true;
    }
    pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps);

    free(pmd_sleeps);
    return default_changed || pmd_changed;
}

/* Applies datapath configuration from the database.  Some of the changes are
 * actually applied in dpif_netdev_run(). */
static int
dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
{
    struct dp_netdev *dp = get_dp_netdev2(dpif);
    const char *cmask = smap_get(other_config, "pmd-cpu-mask");
    const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
                                             "cycles");
    unsigned long long insert_prob =
        smap_get_ullong(other_config, "emc-insert-inv-prob",
                        DEFAULT_EM_FLOW_INSERT_INV_PROB);
    uint32_t insert_min, cur_min;
    uint32_t tx_flush_interval, cur_tx_flush_interval;
    uint64_t rebalance_intvl;
    uint8_t cur_rebalance_load;
    uint32_t rebalance_load, rebalance_improve;
    bool log_autolb = false;
    enum sched_assignment_type pmd_rxq_assign_type;
    /* True only during the first call of this function in the process. */
    static bool first_set_config = true;

    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
                                     DEFAULT_TX_FLUSH_INTERVAL);
    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
    if (tx_flush_interval != cur_tx_flush_interval) {
        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
                  tx_flush_interval);
    }

    if (!nullable_string_is_equal(dp->req_pmd_cmask, cmask)) {
        free(dp->req_pmd_cmask);
        dp->req_pmd_cmask =
nullable_xstrdup(cmask);
        /* Once DOCA is initialized, a core mask change is only honoured on
         * the first configuration call. */
        if (ovs_doca_initialized() && !first_set_config) {
            VLOG_WARN("Reconfiguring PMD threads requires restarting the daemon. "
                      "Change to PMD coremask is ignored.");
        } else {
            free(dp->pmd_cmask);
            dp->pmd_cmask = nullable_xstrdup(cmask);
            dp_netdev_request_reconfigure(dp);
        }
    }

    if (first_set_config) {
        dpif_netdev2_set_n_pmd_threads(dp->pmd_cmask);
    }

    /* "emc-insert-inv-prob" is an inverse probability: 0 disables insertion,
     * and values above UINT32_MAX fall back to the defaults. */
    atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
    if (insert_prob <= UINT32_MAX) {
        insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
    } else {
        insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
        insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
    }

    if (insert_min != cur_min) {
        atomic_store_relaxed(&dp->emc_insert_min, insert_min);
        if (insert_min == 0) {
            VLOG_INFO("EMC insertion probability changed to zero");
        } else {
            VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
                      insert_prob, (100 / (float)insert_prob));
        }
    }

    bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
    bool cur_perf_enabled;
    atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
    if (perf_enabled != cur_perf_enabled) {
        atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
        if (perf_enabled) {
            VLOG_INFO("PMD performance metrics collection enabled");
        } else {
            VLOG_INFO("PMD performance metrics collection disabled");
        }
    }

    bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
    bool cur_smc;
    atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
    if (smc_enable != cur_smc) {
        atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
        if (smc_enable) {
            VLOG_INFO("SMC cache is enabled");
        } else {
            VLOG_INFO("SMC cache is disabled");
        }
    }

    /* Map the "pmd-rxq-assign" string onto its scheduling type. */
    if (!strcmp(pmd_rxq_assign, "roundrobin")) {
        pmd_rxq_assign_type = SCHED_ROUNDROBIN;
    } else if (!strcmp(pmd_rxq_assign, "cycles")) {
        pmd_rxq_assign_type = SCHED_CYCLES;
    } else if (!strcmp(pmd_rxq_assign, "group")) {
        pmd_rxq_assign_type =
SCHED_GROUP;
    } else {
        /* Default. */
        VLOG_WARN("Unsupported rx queue to PMD assignment mode in "
                  "pmd-rxq-assign. Defaulting to 'cycles'.");
        pmd_rxq_assign_type = SCHED_CYCLES;
        pmd_rxq_assign = "cycles";
    }

    dpif_netdev_set_config_e2e_cache(other_config);

    if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) {
        dp->pmd_rxq_assign_type = pmd_rxq_assign_type;
        VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
                  pmd_rxq_assign);
        dp_netdev_request_reconfigure(dp);
    }

    bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true);

    if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) {
        /* Invalid combination. */
        VLOG_WARN("pmd-rxq-isolate can only be set false "
                  "when using pmd-rxq-assign=group");
        pmd_iso = true;
    }
    if (dp->pmd_iso != pmd_iso) {
        dp->pmd_iso = pmd_iso;
        if (pmd_iso) {
            VLOG_INFO("pmd-rxq-affinity isolates PMD core");
        } else {
            VLOG_INFO("pmd-rxq-affinity does not isolate PMD core");
        }
        dp_netdev_request_reconfigure(dp);
    }

    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;

    /* An interval above MAX_ALB_REBALANCE_INTERVAL falls back to the
     * default. */
    rebalance_intvl = smap_get_ullong(other_config,
                                      "pmd-auto-lb-rebal-interval",
                                      ALB_REBALANCE_INTERVAL);
    if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) {
        rebalance_intvl = ALB_REBALANCE_INTERVAL;
    }

    /* Input is in min, convert it to msec. */
    rebalance_intvl =
        rebalance_intvl ?
rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;

    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
        pmd_alb->rebalance_intvl = rebalance_intvl;
        VLOG_INFO("PMD auto load balance interval set to "
                  "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC);
        log_autolb = true;
    }

    /* Thresholds are percentages; values above 100 fall back to the
     * defaults. */
    rebalance_improve = smap_get_uint(other_config,
                                      "pmd-auto-lb-improvement-threshold",
                                      ALB_IMPROVEMENT_THRESHOLD);
    if (rebalance_improve > 100) {
        rebalance_improve = ALB_IMPROVEMENT_THRESHOLD;
    }
    if (rebalance_improve != pmd_alb->rebalance_improve_thresh) {
        pmd_alb->rebalance_improve_thresh = rebalance_improve;
        VLOG_INFO("PMD auto load balance improvement threshold set to "
                  "%"PRIu32"%%", rebalance_improve);
        log_autolb = true;
    }

    rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold",
                                   ALB_LOAD_THRESHOLD);
    if (rebalance_load > 100) {
        rebalance_load = ALB_LOAD_THRESHOLD;
    }
    atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load);
    if (rebalance_load != cur_rebalance_load) {
        atomic_store_relaxed(&pmd_alb->rebalance_load_thresh,
                             rebalance_load);
        VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%",
                  rebalance_load);
        log_autolb = true;
    }

    bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false);

    /* 'log_autolb' forces the auto load balance state to be logged again
     * whenever one of its parameters changed above. */
    set_pmd_auto_lb(dp, autolb_state, log_autolb);

    bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config);
    if (first_set_config || sleep_changed) {
        log_all_pmd_sleeps(dp);
    }

    dpif_netdev2_set_ext_config(dp, other_config, first_set_config);

    first_set_config = false;
    return 0;
}

/* Parses affinity list and returns result in 'core_ids'.
*/ +static int +parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq) +{ + unsigned i; + char *list, *copy, *key, *value; + int error = 0; + + for (i = 0; i < n_rxq; i++) { + core_ids[i] = OVS_CORE_UNSPEC; + } + + if (!affinity_list) { + return 0; + } + + list = copy = xstrdup(affinity_list); + + while (ofputil_parse_key_value(&list, &key, &value)) { + int rxq_id, core_id; + + if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0 + || !str_to_int(value, 0, &core_id) || core_id < 0) { + error = EINVAL; + break; + } + + if (rxq_id < n_rxq) { + core_ids[rxq_id] = core_id; + } + } + + free(copy); + return error; +} + +/* Parses 'affinity_list' and applies configuration if it is valid. */ +static int +dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port, + const char *affinity_list) +{ + unsigned *core_ids, i; + int error = 0; + + core_ids = xmalloc(port->n_rxq * sizeof *core_ids); + if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) { + error = EINVAL; + goto exit; + } + + for (i = 0; i < port->n_rxq; i++) { + port->rxqs[i].core_id = core_ids[i]; + } + +exit: + free(core_ids); + return error; +} + +/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list' + * of given PMD thread. */ +static bool +dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_port *port) + OVS_EXCLUDED(pmd->port_mutex) +{ + struct rxq_poll *poll; + bool found = false; + + ovs_mutex_lock(&pmd->port_mutex); + HMAP_FOR_EACH (poll, node, &pmd->poll_list) { + if (port == poll->rxq->port) { + found = true; + break; + } + } + ovs_mutex_unlock(&pmd->port_mutex); + return found; +} + +/* Updates port configuration from the database. The changes are actually + * applied in dpif_netdev_run(). 
*/ +static int +dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no, + const struct smap *cfg) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_port *port; + int error = 0; + char *affinity_list = nullable_xstrdup(smap_get(cfg, "pmd-rxq-affinity")); + bool emc_enabled = smap_get_bool(cfg, "emc-enable", true); + const char *tx_steering_mode = smap_get(cfg, "tx-steering"); + enum txq_req_mode txq_mode; + + ovs_rwlock_wrlock(&dp->port_rwlock); + error = get_port_by_number(dp, port_no, &port); + if (error) { + goto unlock; + } + + if (emc_enabled != port->emc_enabled) { + struct dp_netdev_pmd_thread *pmd; + struct ds ds = DS_EMPTY_INITIALIZER; + uint32_t cur_min, insert_prob; + + port->emc_enabled = emc_enabled; + /* Mark for reload all the threads that polls this port and request + * for reconfiguration for the actual reloading of threads. */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (dpif_netdev_pmd_polls_port(pmd, port)) { + pmd->need_reload = true; + } + } + dp_netdev_request_reconfigure(dp); + + ds_put_format(&ds, "%s: EMC has been %s.", + netdev_get_name(port->netdev), + (emc_enabled) ? "enabled" : "disabled"); + if (emc_enabled) { + ds_put_cstr(&ds, " Current insertion probability is "); + atomic_read_relaxed(&dp->emc_insert_min, &cur_min); + if (!cur_min) { + ds_put_cstr(&ds, "zero."); + } else { + insert_prob = UINT32_MAX / cur_min; + ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).", + insert_prob, 100 / (float) insert_prob); + } + } + VLOG_INFO("%s", ds_cstr(&ds)); + ds_destroy(&ds); + } + + /* Checking for RXq affinity changes. 
*/ + if (ovs_doca_initialized() && netdev_is_pmd(port->netdev)) { + dp_netdev_doca_affinity_list(dp, port->netdev, &affinity_list); + if (!nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) { + VLOG_INFO("%s: Using fixed affinity list '%s'", + netdev_get_name(port->netdev), affinity_list); + } + } + if (netdev_is_pmd(port->netdev) + && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) { + + error = dpif_netdev_port_set_rxq_affinity(port, affinity_list); + if (error) { + goto unlock; + } + free(port->rxq_affinity_list); + port->rxq_affinity_list = nullable_xstrdup(affinity_list); + + dp_netdev_request_reconfigure(dp); + } + + if (nullable_string_is_equal(tx_steering_mode, "hash")) { + txq_mode = TXQ_REQ_MODE_HASH; + } else { + txq_mode = TXQ_REQ_MODE_THREAD; + } + + if (txq_mode != port->txq_requested_mode) { + port->txq_requested_mode = txq_mode; + VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.", + netdev_get_name(port->netdev), + (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash"); + dp_netdev_request_reconfigure(dp); + } + +unlock: + ovs_rwlock_unlock(&dp->port_rwlock); + free(affinity_list); + return error; +} + +static int +dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED, + uint32_t queue_id, uint32_t *priority) +{ + *priority = queue_id; + return 0; +} + + +/* Creates and returns a new 'struct dp_netdev_actions', whose actions are + * a copy of the 'size' bytes of 'actions' input parameters. 
*/ +struct dp_netdev_actions * +dp_netdev_actions_create(const struct nlattr *actions, size_t size) +{ + struct dp_netdev_actions *netdev_actions; + + netdev_actions = xmalloc(sizeof *netdev_actions + size); + netdev_actions->size = size; + if (size) { + memcpy(netdev_actions->actions, actions, size); + } + + return netdev_actions; +} + +struct dp_netdev_actions * +dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow) +{ + return ovsrcu_get(struct dp_netdev_actions *, &flow->actions); +} + +static void +dp_netdev_actions_free(struct dp_netdev_actions *actions) +{ + free(actions); +} + +static void +dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, + enum rxq_cycles_counter_type type, + unsigned long long cycles) +{ + atomic_store_relaxed(&rx->cycles[type], cycles); +} + +static void +dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx, + enum rxq_cycles_counter_type type, + unsigned long long cycles) +{ + non_atomic_ullong_add(&rx->cycles[type], cycles); +} + +static uint64_t +dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx, + enum rxq_cycles_counter_type type) +{ + unsigned long long processing_cycles; + atomic_read_relaxed(&rx->cycles[type], &processing_cycles); + return processing_cycles; +} + +static void +dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, + unsigned long long cycles) +{ + unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX; + atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles); +} + +static uint64_t +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx) +{ + unsigned long long processing_cycles; + atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles); + return processing_cycles; +} + +#if ATOMIC_ALWAYS_LOCK_FREE_8B +static inline bool +pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd) +{ + bool pmd_perf_enabled; + atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled); + return pmd_perf_enabled; +} +#else +/* If stores and reads of 64-bit integers are 
not atomic, the full PMD + * performance metrics are not available as locked access to 64 bit + * integers would be prohibitively expensive. */ +static inline bool +pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED) +{ + return false; +} +#endif + +static int +dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd, + struct tx_port *p) +{ + int i; + int tx_qid; + int output_cnt; + bool concurrent_txqs; + struct cycle_timer timer; + uint64_t cycles; + uint32_t tx_flush_interval; + + cycle_timer_start(&pmd->perf_stats, &timer); + + output_cnt = dp_packet_batch_size(&p->output_pkts); + ovs_assert(output_cnt > 0); + + if (p->port->txq_mode == TXQ_MODE_XPS_HASH) { + int n_txq = netdev_n_txq(p->port->netdev); + + /* Re-batch per txq based on packet hash. */ + struct dp_packet *packet; + DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) { + uint32_t hash; + + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { + hash = dp_packet_get_rss_hash(packet); + } else { + struct flow flow; + + flow_extract(packet, &flow); + hash = flow_hash_5tuple(&flow, 0); + } + dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet); + } + + /* Flush batches of each Tx queues. */ + for (i = 0; i < n_txq; i++) { + if (dp_packet_batch_is_empty(&p->txq_pkts[i])) { + continue; + } + netdev_send(p->port->netdev, i, &p->txq_pkts[i], true); + dp_packet_batch_init(&p->txq_pkts[i]); + } + } else { + if (p->port->txq_mode == TXQ_MODE_XPS) { + tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p); + concurrent_txqs = true; + } else { + tx_qid = pmd->static_tx_qid; + concurrent_txqs = false; + } + netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs); + } + dp_packet_batch_init(&p->output_pkts); + + /* Update time of the next flush. 
*/ + atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval); + p->flush_time = pmd->ctx.now + tx_flush_interval; + + ovs_assert(pmd->n_output_batches > 0); + pmd->n_output_batches--; + + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt); + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1); + + /* Distribute send cycles evenly among transmitted packets and assign to + * their respective rx queues. */ + cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt; + for (i = 0; i < output_cnt; i++) { + if (p->output_pkts_rxqs[i]) { + dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i], + RXQ_CYCLES_PROC_CURR, cycles); + } + } + + return output_cnt; +} + +static int +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, + bool force) +{ + struct tx_port *p; + int output_cnt = 0; + + if (!pmd->n_output_batches) { + return 0; + } + + HMAP_FOR_EACH (p, node, &pmd->send_port_cache) { + if (!dp_packet_batch_is_empty(&p->output_pkts) + && (force || pmd->ctx.now >= p->flush_time)) { + dp_netdev_pmd_idle_end(pmd); + output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p); + } + } + return output_cnt; +} + +static int +dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_rxq *rxq, + odp_port_t port_no) +{ + struct pmd_perf_stats *s = &pmd->perf_stats; + struct dp_packet_batch batch; + struct cycle_timer timer; + int error; + int batch_cnt = 0; + int rem_qlen = 0, *qlen_p = NULL; + uint64_t cycles; + + /* Measure duration for polling and processing rx burst. */ + cycle_timer_start(&pmd->perf_stats, &timer); + + pmd->ctx.last_rxq = rxq; + dp_packet_batch_init(&batch); + + /* Fetch the rx queue length only for vhostuser ports. */ + if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) { + qlen_p = &rem_qlen; + } + + error = netdev_rxq_recv(rxq->rx, &batch, qlen_p); + if (!error) { + dp_netdev_pmd_idle_end(pmd); + /* At least one packet received. 
*/ + *recirc_depth_get() = 0; + pmd_thread_ctx_time_update(pmd); + batch_cnt = dp_packet_batch_size(&batch); + if (pmd_perf_metrics_enabled(pmd)) { + /* Update batch histogram. */ + s->current.batches++; + histogram_add_sample(&s->pkts_per_batch, batch_cnt); + /* Update the maximum vhost rx queue fill level. */ + if (rxq->is_vhost && rem_qlen >= 0) { + uint32_t qfill = batch_cnt + rem_qlen; + if (qfill > s->current.max_vhost_qfill) { + s->current.max_vhost_qfill = qfill; + } + } + } + + /* Process packet batch. */ + int ret = pmd->netdev_input_func(pmd, &batch, port_no); + if (ret) { + dp_netdev_input(pmd, &batch, port_no); + } + + /* Assign processing cycles to rx queue. */ + cycles = cycle_timer_stop(&pmd->perf_stats, &timer); + dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles); + + dp_netdev_pmd_flush_output_packets(pmd, false); + } else { + /* Discard cycles. */ + cycle_timer_stop(&pmd->perf_stats, &timer); + if (error != EAGAIN && error != EOPNOTSUPP) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + + if (dp_netdev_port_exists(rxq->port->netdev)) { + VLOG_WARN_RL(&rl, "error receiving data from %s: %s", + netdev_rxq_get_name(rxq->rx), ovs_strerror(error)); + } + } + } + + pmd->ctx.last_rxq = NULL; + + return batch_cnt; +} + +static struct tx_port * +tx_port_lookup(const struct hmap *hmap, odp_port_t port_no) +{ + struct tx_port *tx; + + HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) { + if (tx->port->port_no == port_no) { + return tx; + } + } + + return NULL; +} + +static struct tx_bond * +tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id) +{ + uint32_t hash = hash_bond_id(bond_id); + struct tx_bond *tx; + + CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) { + if (tx->bond_id == bond_id) { + return tx; + } + } + return NULL; +} + +static int +port_reconfigure(struct dp_netdev_port *port) +{ + struct netdev *netdev = port->netdev; + int i, err; + + /* Closes the existing 'rxq's. 
*/ + for (i = 0; i < port->n_rxq; i++) { + netdev_rxq_close(port->rxqs[i].rx); + port->rxqs[i].rx = NULL; + } + unsigned last_nrxq = port->n_rxq; + port->n_rxq = 0; + + /* Allows 'netdev' to apply the pending configuration changes. */ + if (netdev_is_reconf_required(netdev) || port->need_reconfigure) { + err = netdev_reconfigure(netdev); + if (err && (err != EOPNOTSUPP)) { + if (err != EAGAIN) { + VLOG_ERR("Failed to set interface %s new configuration", + netdev_get_name(netdev)); + } + return err; + } + } + /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */ + port->rxqs = xrealloc(port->rxqs, + sizeof *port->rxqs * netdev_n_rxq(netdev)); + /* Realloc 'used' counters for tx queues. */ + free(port->txq_used); + port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used); + + for (i = 0; i < netdev_n_rxq(netdev); i++) { + bool new_queue = i >= last_nrxq; + if (new_queue) { + memset(&port->rxqs[i], 0, sizeof port->rxqs[i]); + } + + port->rxqs[i].port = port; + port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9); + + err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i); + if (err) { + return err; + } + port->n_rxq++; + } + + /* Parse affinity list to apply configuration for new queues. */ + dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list); + + /* If reconfiguration was successful mark it as such, so we can use it */ + port->need_reconfigure = false; + + return 0; +} + +struct sched_numa_list { + struct hmap numas; /* Contains 'struct sched_numa'. */ +}; + +/* Meta data for out-of-place pmd rxq assignments. */ +struct sched_pmd { + struct sched_numa *numa; + /* Associated PMD thread. */ + struct dp_netdev_pmd_thread *pmd; + uint64_t pmd_proc_cycles; + struct dp_netdev_rxq **rxqs; + unsigned n_rxq; + bool isolated; +}; + +struct sched_numa { + struct hmap_node node; + int numa_id; + /* PMDs on numa node. */ + struct sched_pmd *pmds; + /* Num of PMDs on numa node. 
*/ + unsigned n_pmds; + /* Num of isolated PMDs on numa node. */ + unsigned n_isolated; + int rr_cur_index; + bool rr_idx_inc; +}; + +static size_t +sched_numa_list_count(struct sched_numa_list *numa_list) +{ + return hmap_count(&numa_list->numas); +} + +static struct sched_numa * +sched_numa_list_next(struct sched_numa_list *numa_list, + const struct sched_numa *numa) +{ + struct hmap_node *node = NULL; + + if (numa) { + node = hmap_next(&numa_list->numas, &numa->node); + } + if (!node) { + node = hmap_first(&numa_list->numas); + } + + return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL; +} + +static struct sched_numa * +sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id) +{ + struct sched_numa *numa; + + HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), + &numa_list->numas) { + if (numa->numa_id == numa_id) { + return numa; + } + } + return NULL; +} + +static int +compare_sched_pmd_list(const void *a_, const void *b_) +{ + struct sched_pmd *a, *b; + + a = (struct sched_pmd *) a_; + b = (struct sched_pmd *) b_; + + return compare_poll_thread_list(&a->pmd, &b->pmd); +} + +static void +sort_numa_list_pmds(struct sched_numa_list *numa_list) +{ + struct sched_numa *numa; + + HMAP_FOR_EACH (numa, node, &numa_list->numas) { + if (numa->n_pmds > 1) { + qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds, + compare_sched_pmd_list); + } + } +} + +/* Populate numas and pmds on those numas. */ +static void +sched_numa_list_populate(struct sched_numa_list *numa_list, + struct dp_netdev *dp) +{ + struct dp_netdev_pmd_thread *pmd; + + hmap_init(&numa_list->numas); + + /* For each pmd on this datapath. */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + struct sched_numa *numa; + struct sched_pmd *sched_pmd; + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + /* Get the numa of the PMD. */ + numa = sched_numa_list_lookup(numa_list, pmd->numa_id); + /* Create a new numa node for it if not already created. 
*/ + if (!numa) { + numa = xzalloc(sizeof *numa); + numa->numa_id = pmd->numa_id; + hmap_insert(&numa_list->numas, &numa->node, + hash_int(pmd->numa_id, 0)); + } + + /* Create a sched_pmd on this numa for the pmd. */ + numa->n_pmds++; + numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds); + sched_pmd = &numa->pmds[numa->n_pmds - 1]; + memset(sched_pmd, 0, sizeof *sched_pmd); + sched_pmd->numa = numa; + sched_pmd->pmd = pmd; + /* At least one pmd is present so initialize curr_idx and idx_inc. */ + numa->rr_cur_index = 0; + numa->rr_idx_inc = true; + } + sort_numa_list_pmds(numa_list); +} + +static void +sched_numa_list_free_entries(struct sched_numa_list *numa_list) +{ + struct sched_numa *numa; + + HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) { + for (unsigned i = 0; i < numa->n_pmds; i++) { + struct sched_pmd *sched_pmd; + + sched_pmd = &numa->pmds[i]; + sched_pmd->n_rxq = 0; + free(sched_pmd->rxqs); + } + numa->n_pmds = 0; + free(numa->pmds); + free(numa); + } + hmap_destroy(&numa_list->numas); +} + +static struct sched_pmd * +sched_pmd_find_by_pmd(struct sched_numa_list *numa_list, + struct dp_netdev_pmd_thread *pmd) +{ + struct sched_numa *numa; + + HMAP_FOR_EACH (numa, node, &numa_list->numas) { + for (unsigned i = 0; i < numa->n_pmds; i++) { + struct sched_pmd *sched_pmd; + + sched_pmd = &numa->pmds[i]; + if (pmd == sched_pmd->pmd) { + return sched_pmd; + } + } + } + return NULL; +} + +static void +sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq, + uint64_t cycles) +{ + /* As sched_pmd is allocated outside this fn. better to not assume + * rxqs is initialized to NULL. 
*/ + if (sched_pmd->n_rxq == 0) { + sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs); + } else { + sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) * + sizeof *sched_pmd->rxqs); + } + + sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq; + sched_pmd->pmd_proc_cycles += cycles; +} + +static void +sched_numa_list_assignments(struct sched_numa_list *numa_list, + struct dp_netdev *dp) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct dp_netdev_port *port; + + /* For each port. */ + HMAP_FOR_EACH (port, node, &dp->ports) { + if (!netdev_is_pmd(port->netdev)) { + continue; + } + /* For each rxq on the port. */ + for (unsigned qid = 0; qid < port->n_rxq; qid++) { + struct dp_netdev_rxq *rxq = &port->rxqs[qid]; + struct sched_pmd *sched_pmd; + uint64_t proc_cycles = 0; + + for (int i = 0; i < PMD_INTERVAL_MAX; i++) { + proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, i); + } + + sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd); + if (sched_pmd) { + if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) { + sched_pmd->isolated = true; + } + sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); + } + } + } +} + +static void +sched_numa_list_put_in_place(struct sched_numa_list *numa_list) +{ + struct sched_numa *numa; + + /* For each numa. */ + HMAP_FOR_EACH (numa, node, &numa_list->numas) { + /* For each pmd. */ + for (int i = 0; i < numa->n_pmds; i++) { + struct sched_pmd *sched_pmd; + + sched_pmd = &numa->pmds[i]; + sched_pmd->pmd->isolated = sched_pmd->isolated; + /* For each rxq. */ + for (unsigned k = 0; k < sched_pmd->n_rxq; k++) { + /* Store the new pmd from the out of place sched_numa_list + * struct to the dp_netdev_rxq struct */ + sched_pmd->rxqs[k]->pmd = sched_pmd->pmd; + } + } + } +} + +/* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to + * a PMD thread core on a non-local numa node. 
*/ +static bool +sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list) +{ + struct sched_numa *numa; + + HMAP_FOR_EACH (numa, node, &numa_list->numas) { + for (int i = 0; i < numa->n_pmds; i++) { + struct sched_pmd *sched_pmd; + + sched_pmd = &numa->pmds[i]; + if (sched_pmd->isolated) { + /* All rxqs on this PMD thread core are pinned. */ + continue; + } + for (unsigned k = 0; k < sched_pmd->n_rxq; k++) { + struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k]; + /* Check if the rxq is not pinned to a specific PMD thread core + * by the user AND the PMD thread core that OVS assigned is + * non-local to the rxq port. */ + if (rxq->core_id == OVS_CORE_UNSPEC && + rxq->pmd->numa_id != + netdev_get_numa_id(rxq->port->netdev)) { + return true; + } + } + } + } + return false; +} + +static unsigned +sched_numa_noniso_pmd_count(struct sched_numa *numa) +{ + if (numa->n_pmds > numa->n_isolated) { + return numa->n_pmds - numa->n_isolated; + } + return 0; +} + +/* Sort Rx Queues by the processing cycles they are consuming. */ +static int +compare_rxq_cycles(const void *a, const void *b) +{ + struct dp_netdev_rxq *qa; + struct dp_netdev_rxq *qb; + uint64_t cycles_qa, cycles_qb; + + qa = *(struct dp_netdev_rxq **) a; + qb = *(struct dp_netdev_rxq **) b; + + cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST); + cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST); + + if (cycles_qa != cycles_qb) { + return (cycles_qa < cycles_qb) ? 1 : -1; + } else { + /* Cycles are the same so tiebreak on port/queue id. + * Tiebreaking (as opposed to return 0) ensures consistent + * sort results across multiple OS's. */ + uint32_t port_qa = odp_to_u32(qa->port->port_no); + uint32_t port_qb = odp_to_u32(qb->port->port_no); + if (port_qa != port_qb) { + return port_qa > port_qb ? 
1 : -1; + } else { + return netdev_rxq_get_queue_id(qa->rx) + - netdev_rxq_get_queue_id(qb->rx); + } + } +} + +static bool +sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd, + bool has_proc) +{ + uint64_t current_num, pmd_num; + + if (current_lowest == NULL) { + return true; + } + + if (has_proc) { + current_num = current_lowest->pmd_proc_cycles; + pmd_num = pmd->pmd_proc_cycles; + } else { + current_num = current_lowest->n_rxq; + pmd_num = pmd->n_rxq; + } + + if (pmd_num < current_num) { + return true; + } + return false; +} + +static struct sched_pmd * +sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc) +{ + struct sched_pmd *lowest_sched_pmd = NULL; + + for (unsigned i = 0; i < numa->n_pmds; i++) { + struct sched_pmd *sched_pmd; + + sched_pmd = &numa->pmds[i]; + if (sched_pmd->isolated) { + continue; + } + if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) { + lowest_sched_pmd = sched_pmd; + } + } + return lowest_sched_pmd; +} + +/* + * Returns the next pmd from the numa node. + * + * If 'updown' is 'true' it will alternate between selecting the next pmd in + * either an up or down walk, switching between up/down when the first or last + * core is reached. e.g. 1,2,3,3,2,1,1,2... + * + * If 'updown' is 'false' it will select the next pmd wrapping around when + * last core reached. e.g. 1,2,3,1,2,3,1,2... + */ +static struct sched_pmd * +sched_pmd_next_rr(struct sched_numa *numa, bool updown) +{ + int numa_idx = numa->rr_cur_index; + + if (numa->rr_idx_inc == true) { + /* Incrementing through list of pmds. */ + if (numa->rr_cur_index == numa->n_pmds - 1) { + /* Reached the last pmd. */ + if (updown) { + numa->rr_idx_inc = false; + } else { + numa->rr_cur_index = 0; + } + } else { + numa->rr_cur_index++; + } + } else { + /* Decrementing through list of pmds. */ + if (numa->rr_cur_index == 0) { + /* Reached the first pmd. 
*/ + numa->rr_idx_inc = true; + } else { + numa->rr_cur_index--; + } + } + return &numa->pmds[numa_idx]; +} + +static struct sched_pmd * +sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown) +{ + struct sched_pmd *sched_pmd = NULL; + + /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been + * returned depending on updown. Call it more than n_pmds to ensure all + * PMDs can be searched for the next non-isolated PMD. */ + for (unsigned i = 0; i < numa->n_pmds * 2; i++) { + sched_pmd = sched_pmd_next_rr(numa, updown); + if (!sched_pmd->isolated) { + break; + } + sched_pmd = NULL; + } + return sched_pmd; +} + +static struct sched_pmd * +sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo, + bool has_proc) +{ + if (algo == SCHED_GROUP) { + return sched_pmd_get_lowest(numa, has_proc); + } + + /* By default RR the PMDs. */ + return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false); +} + +static const char * +get_assignment_type_string(enum sched_assignment_type algo) +{ + switch (algo) { + case SCHED_ROUNDROBIN: return "roundrobin"; + case SCHED_CYCLES: return "cycles"; + case SCHED_GROUP: return "group"; + default: return "Unknown"; + } +} + +#define MAX_RXQ_CYC_TEXT 40 +#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT) + +static char * +get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles) +{ + int ret = 0; + + if (algo != SCHED_ROUNDROBIN) { + ret = snprintf(a, MAX_RXQ_CYC_STRLEN, + " (measured processing cycles %"PRIu64")", cycles); + } + + if (algo == SCHED_ROUNDROBIN || ret <= 0) { + a[0] = '\0'; + } + return a; +} + +static void +sched_numa_list_schedule(struct sched_numa_list *numa_list, + struct dp_netdev *dp, + enum sched_assignment_type algo, + enum vlog_level level) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct dp_netdev_port *port; + struct dp_netdev_rxq **rxqs = NULL; + struct sched_numa *last_cross_numa; + unsigned n_rxqs = 0; + bool start_logged = 
false; + size_t n_numa; + + /* For each port. */ + HMAP_FOR_EACH (port, node, &dp->ports) { + if (!netdev_is_pmd(port->netdev)) { + continue; + } + + /* For each rxq on the port. */ + for (int qid = 0; qid < port->n_rxq; qid++) { + struct dp_netdev_rxq *rxq = &port->rxqs[qid]; + + if (algo != SCHED_ROUNDROBIN) { + uint64_t cycle_hist = 0; + + /* Sum the queue intervals and store the cycle history. */ + for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) { + cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i); + } + dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST, + cycle_hist); + } + + /* Check if this rxq is pinned. */ + if (rxq->core_id != OVS_CORE_UNSPEC) { + struct sched_pmd *sched_pmd; + struct dp_netdev_pmd_thread *pmd; + struct sched_numa *numa; + bool iso = dp->pmd_iso; + uint64_t proc_cycles; + char rxq_cyc_log[MAX_RXQ_CYC_STRLEN]; + + /* This rxq should be pinned, pin it now. */ + pmd = dp_netdev_get_pmd(dp, rxq->core_id); + sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd); + dp_netdev_pmd_unref(pmd); + if (!sched_pmd) { + /* Cannot find the PMD. Cannot pin this rxq. */ + VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN, + "Core %2u cannot be pinned with " + "port \'%s\' rx queue %d. Use pmd-cpu-mask to " + "enable a pmd on core %u. An alternative core " + "will be assigned.", + rxq->core_id, + netdev_rxq_get_name(rxq->rx), + netdev_rxq_get_queue_id(rxq->rx), + rxq->core_id); + rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs); + rxqs[n_rxqs++] = rxq; + continue; + } + if (iso) { + /* Mark PMD as isolated if not done already. 
*/ + if (sched_pmd->isolated == false) { + sched_pmd->isolated = true; + numa = sched_pmd->numa; + numa->n_isolated++; + } + } + proc_cycles = dp_netdev_rxq_get_cycles(rxq, + RXQ_CYCLES_PROC_HIST); + VLOG(level, "Core %2u on numa node %d is pinned with " + "port \'%s\' rx queue %d%s", + sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id, + netdev_rxq_get_name(rxq->rx), + netdev_rxq_get_queue_id(rxq->rx), + get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); + sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); + } else { + rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs); + rxqs[n_rxqs++] = rxq; + } + } + } + + if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) { + /* Sort the queues in order of the processing cycles + * they consumed during their last pmd interval. */ + qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles); + } + + last_cross_numa = NULL; + n_numa = sched_numa_list_count(numa_list); + for (unsigned i = 0; i < n_rxqs; i++) { + struct dp_netdev_rxq *rxq = rxqs[i]; + struct sched_pmd *sched_pmd = NULL; + struct sched_numa *numa; + int port_numa_id; + uint64_t proc_cycles; + char rxq_cyc_log[MAX_RXQ_CYC_STRLEN]; + + if (start_logged == false && level != VLL_DBG) { + VLOG(level, "Performing pmd to rx queue assignment using %s " + "algorithm.", get_assignment_type_string(algo)); + start_logged = true; + } + + /* Store the cycles for this rxq as we will log these later. */ + proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST); + + port_numa_id = netdev_get_numa_id(rxq->port->netdev); + + /* Select numa. */ + numa = sched_numa_list_lookup(numa_list, port_numa_id); + + /* Check if numa has no PMDs or no non-isolated PMDs. */ + if (!numa || !sched_numa_noniso_pmd_count(numa)) { + /* Unable to use this numa to find a PMD. */ + numa = NULL; + /* Find any numa with available PMDs. 
*/ + for (int j = 0; j < n_numa; j++) { + numa = sched_numa_list_next(numa_list, last_cross_numa); + last_cross_numa = numa; + if (sched_numa_noniso_pmd_count(numa)) { + break; + } + numa = NULL; + } + } + + if (ovs_doca_initialized()) { + /* In doca configuration, port/queue is assigned statically. Reduce severity. */ + level = VLL_DBG; + } + + if (numa) { + /* Select the PMD that should be used for this rxq. */ + sched_pmd = sched_pmd_next(numa, algo, + proc_cycles ? true : false); + } + + /* Check that a pmd has been selected. */ + if (sched_pmd) { + int pmd_numa_id; + + pmd_numa_id = sched_pmd->numa->numa_id; + /* Check if selected pmd numa matches port numa. */ + if (pmd_numa_id != port_numa_id) { + VLOG(level, "There's no available (non-isolated) pmd thread " + "on numa node %d. Port \'%s\' rx queue %d will " + "be assigned to a pmd on numa node %d. " + "This may lead to reduced performance.", + port_numa_id, netdev_rxq_get_name(rxq->rx), + netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id); + } + VLOG(level, "Core %2u on numa node %d assigned port \'%s\' " + "rx queue %d%s.", + sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id, + netdev_rxq_get_name(rxq->rx), + netdev_rxq_get_queue_id(rxq->rx), + get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); + sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles); + } else { + VLOG(level == VLL_DBG ? level : VLL_WARN, + "No non-isolated pmd on any numa available for " + "port \'%s\' rx queue %d%s. 
" + "This rx queue will not be polled.", + netdev_rxq_get_name(rxq->rx), + netdev_rxq_get_queue_id(rxq->rx), + get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)); + } + } + free(rxqs); +} + +static void +rxq_scheduling(struct dp_netdev *dp) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct sched_numa_list numa_list; + enum sched_assignment_type algo = dp->pmd_rxq_assign_type; + + sched_numa_list_populate(&numa_list, dp); + sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO); + sched_numa_list_put_in_place(&numa_list); + + sched_numa_list_free_entries(&numa_list); +} + +static uint64_t variance(uint64_t a[], int n); + +static uint64_t +sched_numa_variance(struct sched_numa *numa) +{ + uint64_t *percent_busy = NULL; + int n_proc = 0; + uint64_t var; + + percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy); + + for (unsigned i = 0; i < numa->n_pmds; i++) { + struct sched_pmd *sched_pmd; + uint64_t total_cycles = 0; + + sched_pmd = &numa->pmds[i]; + /* Exclude isolated PMDs from variance calculations. */ + if (sched_pmd->isolated == true) { + continue; + } + /* Get the total pmd cycles for an interval. */ + atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles); + + if (total_cycles) { + /* Estimate the cycles to cover all intervals. */ + total_cycles *= PMD_INTERVAL_MAX; + percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100) + / total_cycles; + } else { + percent_busy[n_proc++] = 0; + } + } + var = variance(percent_busy, n_proc); + free(percent_busy); + return var; +} + +/* + * This function checks that some basic conditions needed for a rebalance to be + * effective are met. Such as Rxq scheduling assignment type, more than one + * PMD, more than 2 Rxqs on a PMD. If there was no reconfiguration change + * since the last check, it reuses the last result. + * + * It is not intended to be an inclusive check of every condition that may make + * a rebalance ineffective. 
It is done as a quick check so a full + * pmd_rebalance_dry_run() can be avoided when it is not needed. + */ +static bool +pmd_rebalance_dry_run_needed(struct dp_netdev *dp) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct dp_netdev_pmd_thread *pmd; + struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; + unsigned int cnt = 0; + bool multi_rxq = false; + + /* Check if there was no reconfiguration since last check. */ + if (!pmd_alb->recheck_config) { + if (!pmd_alb->do_dry_run) { + VLOG_DBG("PMD auto load balance nothing to do, " + "no configuration changes since last check."); + return false; + } + return true; + } + pmd_alb->recheck_config = false; + + /* Check for incompatible assignment type. */ + if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) { + VLOG_DBG("PMD auto load balance nothing to do, " + "pmd-rxq-assign=roundrobin assignment type configured."); + return pmd_alb->do_dry_run = false; + } + + /* Check that there is at least 2 non-isolated PMDs and + * one of them is polling more than one rxq. */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) { + continue; + } + + if (hmap_count(&pmd->poll_list) > 1) { + multi_rxq = true; + } + if (cnt && multi_rxq) { + return pmd_alb->do_dry_run = true; + } + cnt++; + } + + VLOG_DBG("PMD auto load balance nothing to do, " + "not enough non-isolated PMDs or RxQs."); + return pmd_alb->do_dry_run = false; +} + +static bool +pmd_rebalance_dry_run(struct dp_netdev *dp) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct sched_numa_list numa_list_cur; + struct sched_numa_list numa_list_est; + bool thresh_met = false; + uint64_t current_var, estimate_var; + struct sched_numa *numa_cur, *numa_est; + uint64_t improvement = 0; + + VLOG_DBG("PMD auto load balance performing dry run."); + + /* Populate current assignments. */ + sched_numa_list_populate(&numa_list_cur, dp); + sched_numa_list_assignments(&numa_list_cur, dp); + + /* Populate estimated assignments. 
*/ + sched_numa_list_populate(&numa_list_est, dp); + sched_numa_list_schedule(&numa_list_est, dp, + dp->pmd_rxq_assign_type, VLL_DBG); + + /* Check if cross-numa polling, there is only one numa with PMDs. */ + if (!sched_numa_list_cross_numa_polling(&numa_list_est) || + sched_numa_list_count(&numa_list_est) == 1) { + + /* Calculate variances. */ + HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) { + numa_est = sched_numa_list_lookup(&numa_list_est, + numa_cur->numa_id); + if (!numa_est) { + continue; + } + current_var = sched_numa_variance(numa_cur); + estimate_var = sched_numa_variance(numa_est); + if (estimate_var < current_var) { + improvement = ((current_var - estimate_var) * 100) + / current_var; + } + VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated " + "variance %"PRIu64". Variance improvement %"PRIu64"%%.", + numa_cur->numa_id, current_var, + estimate_var, improvement); + if (improvement >= dp->pmd_alb.rebalance_improve_thresh) { + thresh_met = true; + } + } + VLOG_DBG("PMD load variance improvement threshold %u%% is %s.", + dp->pmd_alb.rebalance_improve_thresh, + thresh_met ? "met" : "not met"); + } else { + VLOG_DBG("PMD auto load balance detected cross-numa polling with " + "multiple numa nodes. 
Unable to accurately estimate."); + } + + sched_numa_list_free_entries(&numa_list_cur); + sched_numa_list_free_entries(&numa_list_est); + + return thresh_met; +} + +static void +reload_affected_pmds(struct dp_netdev *dp) +{ + struct dp_netdev_pmd_thread *pmd; + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (pmd->need_reload) { + dp_netdev_reload_pmd__(pmd); + } + } + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (pmd->need_reload) { + if (pmd->core_id != NON_PMD_CORE_ID) { + bool reload; + + do { + atomic_read_explicit(&pmd->reload, &reload, + memory_order_acquire); + } while (reload); + } + pmd->need_reload = false; + } + } +} + +static void +reconfigure_pmd_threads(struct dp_netdev *dp) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct dp_netdev_pmd_thread *pmd; + struct ovs_numa_dump *pmd_cores; + struct ovs_numa_info_core *core; + struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete); + struct hmapx_node *node; + bool changed = false; + bool need_to_adjust_static_tx_qids = false; + + /* The pmd threads should be started only if there's a pmd port in the + * datapath. If the user didn't provide any "pmd-cpu-mask", we start + * NR_PMD_THREADS per numa node. */ + if (!has_pmd_port(dp) && !ovs_doca_initialized()) { + pmd_cores = ovs_numa_dump_n_cores_per_numa(0); + } else { + pmd_cores = dp_netdev_pmd_cmask2cores(dp->pmd_cmask); + } + + /* We need to adjust 'static_tx_qid's only if we're reducing number of + * PMD threads. Otherwise, new threads will allocate all the freed ids. */ + if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) { + /* Adjustment is required to keep 'static_tx_qid's sequential and + * avoid possible issues, for example, imbalanced tx queue usage + * and unnecessary locking caused by remapping on netdev level. 
*/ + need_to_adjust_static_tx_qids = true; + } + + /* Check for unwanted pmd threads */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id, + pmd->core_id)) { + hmapx_add(&to_delete, pmd); + } else if (need_to_adjust_static_tx_qids) { + atomic_store_relaxed(&pmd->reload_tx_qid, true); + pmd->need_reload = true; + } + } + + HMAPX_FOR_EACH (node, &to_delete) { + pmd = (struct dp_netdev_pmd_thread *) node->data; + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.", + pmd->numa_id, pmd->core_id); + dp_netdev_del_pmd(dp, pmd); + } + changed = !hmapx_is_empty(&to_delete); + hmapx_destroy(&to_delete); + + if (need_to_adjust_static_tx_qids) { + /* 'static_tx_qid's are not sequential now. + * Reload remaining threads to fix this. */ + reload_affected_pmds(dp); + } + + /* Check for required new pmd threads */ + FOR_EACH_CORE_ON_DUMP(core, pmd_cores) { + pmd = dp_netdev_get_pmd(dp, core->core_id); + if (!pmd) { + struct ds name = DS_EMPTY_INITIALIZER; + + pmd = xzalloc(sizeof *pmd); + dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id); + + ds_put_format(&name, "pmd-c%02d/id:", core->core_id); + pmd->thread = ovs_thread_create(ds_cstr(&name), + pmd_thread_main, pmd); + ds_destroy(&name); + + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.", + pmd->numa_id, pmd->core_id); + changed = true; + } else { + dp_netdev_pmd_unref(pmd); + } + } + + if (changed) { + struct ovs_numa_info_numa *numa; + + /* Log the number of pmd threads per numa node. 
*/ + FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) { + VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d", + numa->n_cores, numa->numa_id); + } + } + + ovs_numa_dump_destroy(pmd_cores); +} + +static void +pmd_remove_stale_ports(struct dp_netdev *dp, + struct dp_netdev_pmd_thread *pmd) + OVS_EXCLUDED(pmd->port_mutex) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct rxq_poll *poll; + struct tx_port *tx; + + ovs_mutex_lock(&pmd->port_mutex); + HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) { + struct dp_netdev_port *port = poll->rxq->port; + + if (port->need_reconfigure + || !hmap_contains(&dp->ports, &port->node) + || port->disabled) { + dp_netdev_del_rxq_from_pmd(pmd, poll); + } + } + HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) { + struct dp_netdev_port *port = tx->port; + + if (port->need_reconfigure + || !hmap_contains(&dp->ports, &port->node) + || port->disabled) { + dp_netdev_del_port_tx_from_pmd(pmd, tx); + } + } + ovs_mutex_unlock(&pmd->port_mutex); +} + +/* Must be called each time a port is added/removed or the cmask changes. + * This creates and destroys pmd threads, reconfigures ports, opens their + * rxqs and assigns all rxqs/txqs to pmd threads. */ +static void +reconfigure_datapath(struct dp_netdev *dp) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads); + struct dp_netdev_pmd_thread *pmd; + struct dp_netdev_port *port; + int wanted_txqs; + + dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq); + + /* Step 1: Adjust the pmd threads based on the datapath ports, the cores + * on the system and the user configuration. */ + reconfigure_pmd_threads(dp); + + wanted_txqs = cmap_count(&dp->poll_threads); + + /* The number of pmd threads might have changed, or a port can be new: + * adjust the txqs. */ + HMAP_FOR_EACH (port, node, &dp->ports) { + netdev_set_tx_multiq(port->netdev, wanted_txqs); + } + + /* Step 2: Remove from the pmd threads ports that have been removed or + * need reconfiguration. 
*/ + + /* Check for all the ports that need reconfiguration. We cache this in + * 'port->need_reconfigure', because netdev_is_reconf_required() can + * change at any time. + * Also mark for reconfiguration all ports which will likely change their + * 'txq_mode' parameter. It's required to stop using them before + * changing this setting and it's simpler to mark ports here and allow + * 'pmd_remove_stale_ports' to remove them from threads. There will be + * no actual reconfiguration in 'port_reconfigure' because it's + * unnecessary. */ + HMAP_FOR_EACH (port, node, &dp->ports) { + if (netdev_is_reconf_required(port->netdev) + || ((port->txq_mode == TXQ_MODE_XPS) + != (netdev_n_txq(port->netdev) < wanted_txqs)) + || ((port->txq_mode == TXQ_MODE_XPS_HASH) + != (port->txq_requested_mode == TXQ_REQ_MODE_HASH + && netdev_n_txq(port->netdev) > 1))) { + port->need_reconfigure = true; + } + } + + /* Remove from the pmd threads all the ports that have been deleted or + * need reconfiguration. */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + pmd_remove_stale_ports(dp, pmd); + } + + /* Reload affected pmd threads. We must wait for the pmd threads before + * reconfiguring the ports, because a port cannot be reconfigured while + * it's being used. */ + reload_affected_pmds(dp); + + /* Step 3: Reconfigure ports. */ + + /* We only reconfigure the ports that we determined above, because they're + * not being used by any pmd thread at the moment. If a port fails to + * reconfigure we remove it from the datapath. */ + HMAP_FOR_EACH_SAFE (port, node, &dp->ports) { + int err; + + if (!port->need_reconfigure) { + continue; + } + + err = port_reconfigure(port); + if (err) { + if (err != EAGAIN) { + hmap_remove(&dp->ports, &port->node); + seq_change(dp->port_seq); + port_destroy(port); + } + } else { + /* With a single queue, there is no point in using hash mode. 
*/ + if (port->txq_requested_mode == TXQ_REQ_MODE_HASH && + netdev_n_txq(port->netdev) > 1) { + port->txq_mode = TXQ_MODE_XPS_HASH; + } else if (netdev_n_txq(port->netdev) < wanted_txqs) { + port->txq_mode = TXQ_MODE_XPS; + } else { + port->txq_mode = TXQ_MODE_STATIC; + } + } + } + + /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads + * for now, we just update the 'pmd' pointer in each rxq to point to the + * wanted thread according to the scheduling policy. */ + + /* Reset all the pmd threads to non isolated. */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + pmd->isolated = false; + } + + /* Reset all the queues to unassigned */ + HMAP_FOR_EACH (port, node, &dp->ports) { + for (int i = 0; i < port->n_rxq; i++) { + port->rxqs[i].pmd = NULL; + } + } + rxq_scheduling(dp); + + /* Step 5: Remove queues not compliant with new scheduling. */ + + /* Count all the threads that will have at least one queue to poll. */ + HMAP_FOR_EACH (port, node, &dp->ports) { + for (int qid = 0; qid < port->n_rxq; qid++) { + struct dp_netdev_rxq *q = &port->rxqs[qid]; + + if (q->pmd) { + hmapx_add(&busy_threads, q->pmd); + } + } + } + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + struct rxq_poll *poll; + + ovs_mutex_lock(&pmd->port_mutex); + HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) { + if (poll->rxq->pmd != pmd) { + dp_netdev_del_rxq_from_pmd(pmd, poll); + + /* This pmd might sleep after this step if it has no rxq + * remaining. Tell it to busy wait for new assignment if it + * has at least one scheduled queue. */ + if (hmap_count(&pmd->poll_list) == 0 && + hmapx_contains(&busy_threads, pmd)) { + atomic_store_relaxed(&pmd->wait_for_reload, true); + } + } + } + ovs_mutex_unlock(&pmd->port_mutex); + } + + hmapx_destroy(&busy_threads); + + /* Reload affected pmd threads. We must wait for the pmd threads to remove + * the old queues before readding them, otherwise a queue can be polled by + * two threads at the same time. 
*/ + reload_affected_pmds(dp); + + /* Step 6: Add queues from scheduling, if they're not there already. */ + HMAP_FOR_EACH (port, node, &dp->ports) { + if (!netdev_is_pmd(port->netdev) || port->disabled) { + continue; + } + + for (int qid = 0; qid < port->n_rxq; qid++) { + struct dp_netdev_rxq *q = &port->rxqs[qid]; + + if (q->pmd) { + ovs_mutex_lock(&q->pmd->port_mutex); + dp_netdev_add_rxq_to_pmd(q->pmd, q); + ovs_mutex_unlock(&q->pmd->port_mutex); + } + } + } + + /* Add every port and bond to the tx port and bond caches of + * every pmd thread, if it's not there already and if this pmd + * has at least one rxq to poll. + */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + ovs_mutex_lock(&pmd->port_mutex); + if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) { + struct tx_bond *bond; + + HMAP_FOR_EACH (port, node, &dp->ports) { + if (port->disabled) { + continue; + } + dp_netdev_add_port_tx_to_pmd(pmd, port); + } + + CMAP_FOR_EACH (bond, node, &dp->tx_bonds) { + dp_netdev_add_bond_tx_to_pmd(pmd, bond, false); + } + } + ovs_mutex_unlock(&pmd->port_mutex); + } + + /* Reload affected pmd threads. */ + reload_affected_pmds(dp); + + /* PMD ALB will need to recheck if dry run needed. */ + dp->pmd_alb.recheck_config = true; +} + +/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */ +static bool +ports_require_restart(const struct dp_netdev *dp) + OVS_REQ_RDLOCK(dp->port_rwlock) +{ + struct dp_netdev_port *port; + + HMAP_FOR_EACH (port, node, &dp->ports) { + if (netdev_is_reconf_required(port->netdev)) { + return true; + } + } + + return false; +} + +/* Calculates variance in the values stored in array 'a'. 'n' is the number + * of elements in array to be considered for calculating vairance. + * Usage example: data array 'a' contains the processing load of each pmd and + * 'n' is the number of PMDs. 
/* Returns the population variance, in integer arithmetic, of the first 'n'
 * values of array 'a': the truncated mean of the squared differences from
 * the truncated mean. Returns 0 when 'n' is 0 or when all values are 0.
 *
 * Usage example: 'a' holds the processing load of each PMD and 'n' is the
 * number of PMDs; the result is the variance in processing load of PMDs. */
static uint64_t
variance(uint64_t a[], int n)
{
    uint64_t total = 0;
    uint64_t sq_dev = 0;
    uint64_t avg;

    if (!n) {
        return 0;
    }

    for (int k = 0; k < n; k++) {
        total += a[k];
    }
    if (!total) {
        return 0;
    }

    avg = total / n;

    /* Accumulate the squared deviations from the mean. For values below
     * the mean the unsigned subtraction wraps around, but the product of
     * the wrapped value with itself still equals the true square modulo
     * 2^64. */
    for (int k = 0; k < n; k++) {
        uint64_t dev = a[k] - avg;

        sq_dev += dev * dev;
    }

    return sq_dev / n;
}
*/ + pmd_thread_ctx_time_update(non_pmd); + dp_netdev_pmd_flush_output_packets(non_pmd, false); + } + + dpif_netdev_xps_revalidate_pmd(non_pmd, false); + ovs_mutex_unlock(&dp->non_pmd_mutex); + + dp_netdev_pmd_unref(non_pmd); + } + + struct pmd_auto_lb *pmd_alb = &dp->pmd_alb; + if (pmd_alb->is_enabled) { + if (!pmd_alb->rebalance_poll_timer) { + pmd_alb->rebalance_poll_timer = now; + } else if ((pmd_alb->rebalance_poll_timer + + pmd_alb->rebalance_intvl) < now) { + pmd_alb->rebalance_poll_timer = now; + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + if (atomic_count_get(&pmd->pmd_overloaded) >= + PMD_INTERVAL_MAX) { + pmd_rebalance = true; + break; + } + } + + if (pmd_rebalance && + !dp_netdev_is_reconf_required(dp) && + !ports_require_restart(dp) && + pmd_rebalance_dry_run_needed(dp) && + pmd_rebalance_dry_run(dp)) { + VLOG_INFO("PMD auto load balance dry run. " + "Requesting datapath reconfigure."); + dp_netdev_request_reconfigure(dp); + } + } + } + + if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) { + reconfigure_datapath(dp); + } + ovs_rwlock_unlock(&dp->port_rwlock); + + tnl_neigh_cache_run(); + tnl_port_map_run(); + new_tnl_seq = seq_read(tnl_conf_seq); + + if (dp->last_tnl_conf_seq != new_tnl_seq) { + dp->last_tnl_conf_seq = new_tnl_seq; + return true; + } + return false; +} + +static void +dpif_netdev_wait(struct dpif *dpif) +{ + struct dp_netdev_port *port; + struct dp_netdev *dp = get_dp_netdev2(dpif); + + ovs_mutex_lock(&dp_netdev_mutex); + dp_netdev_port_rdlock(dp); + HMAP_FOR_EACH (port, node, &dp->ports) { + netdev_wait_reconf_required(port->netdev); + if (!netdev_is_pmd(port->netdev)) { + int i; + + for (i = 0; i < port->n_rxq; i++) { + netdev_rxq_wait(port->rxqs[i].rx); + } + } + } + ovs_rwlock_unlock(&dp->port_rwlock); + ovs_mutex_unlock(&dp_netdev_mutex); + seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq); +} + +static void +pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd) +{ + struct tx_port *tx_port_cached; + + /* Flush 
all the queued packets. */ + dp_netdev_pmd_flush_output_packets(pmd, true); + /* Free all used tx queue ids. */ + dpif_netdev_xps_revalidate_pmd(pmd, true); + + HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) { + free(tx_port_cached->txq_pkts); + free(tx_port_cached); + } + HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) { + free(tx_port_cached->txq_pkts); + free(tx_port_cached); + } + free(pmd->rx_port_cache); + pmd->rx_port_cache = NULL; + pmd->rx_port_count = 0; +} + +static void +pmd_load_rx_ports(struct dp_netdev_pmd_thread *pmd) + OVS_REQUIRES(pmd->port_mutex) +{ + struct dp_netdev_port **cache; + struct rxq_poll *poll; + int count; + + cache = xrealloc(pmd->rx_port_cache, + hmap_count(&pmd->poll_list) * sizeof(*cache)); + + count = 0; + HMAP_FOR_EACH (poll, node, &pmd->poll_list) { + struct dp_netdev_port *port = poll->rxq->port; + int i; + + for (i = 0; i < count; i++) { + if (cache[i] == port) { + break; + } + } + if (i == count) { + cache[count++] = port; + } + } + + pmd->rx_port_cache = cache; + pmd->rx_port_count = count; +} + +/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to + * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel + * device, otherwise to 'pmd->send_port_cache' if the port has at least + * one txq. 
*/ +static void +pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd) + OVS_REQUIRES(pmd->port_mutex) +{ + struct tx_port *tx_port, *tx_port_cached; + + pmd_free_cached_ports(pmd); + hmap_shrink(&pmd->send_port_cache); + hmap_shrink(&pmd->tnl_port_cache); + + HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) { + int n_txq = netdev_n_txq(tx_port->port->netdev); + struct dp_packet_batch *txq_pkts_cached; + + if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) { + tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); + if (tx_port->txq_pkts) { + txq_pkts_cached = xmemdup(tx_port->txq_pkts, + n_txq * sizeof *tx_port->txq_pkts); + tx_port_cached->txq_pkts = txq_pkts_cached; + } + hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node, + hash_port_no(tx_port_cached->port->port_no)); + } + + if (n_txq) { + tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached); + if (tx_port->txq_pkts) { + txq_pkts_cached = xmemdup(tx_port->txq_pkts, + n_txq * sizeof *tx_port->txq_pkts); + tx_port_cached->txq_pkts = txq_pkts_cached; + } + hmap_insert(&pmd->send_port_cache, &tx_port_cached->node, + hash_port_no(tx_port_cached->port->port_no)); + } + } + + pmd_load_rx_ports(pmd); +} + +static void +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd) +{ + ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); + if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) { + VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d" + ", numa_id %d.", pmd->core_id, pmd->numa_id); + } + ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); + + VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d" + ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id); +} + +static void +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd) +{ + ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); + id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid); + ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); +} + +static int +pmd_load_queues_and_ports(struct dp_netdev_pmd_thread 
*pmd, + struct polled_queue **ppoll_list) +{ + struct polled_queue *poll_list = *ppoll_list; + struct rxq_poll *poll; + int i; + + ovs_mutex_lock(&pmd->port_mutex); + poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list) + * sizeof *poll_list); + + i = 0; + HMAP_FOR_EACH (poll, node, &pmd->poll_list) { + poll_list[i].rxq = poll->rxq; + poll_list[i].port_no = poll->rxq->port->port_no; + poll_list[i].emc_enabled = poll->rxq->port->emc_enabled; + poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx); + poll_list[i].change_seq = + netdev_get_change_seq(poll->rxq->port->netdev); + i++; + } + + pmd_load_cached_ports(pmd); + + ovs_mutex_unlock(&pmd->port_mutex); + + *ppoll_list = poll_list; + return i; +} + +static void * +pmd_thread_main(void *f_) +{ + struct dp_netdev_pmd_thread *pmd = f_; + struct pmd_perf_stats *s = &pmd->perf_stats; + unsigned int lc = 0; + struct polled_queue *poll_list; + bool wait_for_reload = false; + bool dpdk_attached; + bool reload_tx_qid; + bool exiting; + bool reload; + int poll_cnt; + int i; + int process_packets = 0; + uint64_t sleep_time = 0; + + poll_list = NULL; + + /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ + ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); + ovs_numa_thread_setaffinity_core(pmd->core_id); + dpdk_attached = dpdk_attach_thread(pmd->core_id); + poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); + dfc_cache_init(&pmd->flow_cache); + pmd_alloc_static_tx_qid(pmd); + set_timer_resolution(PMD_TIMER_RES_NS); + + /* For e2e, PMDs should not handle offloads. 
*/ + if (!dp_netdev_e2e_cache_enabled) { + pmd_thread_offload_init(pmd); + } + +reload: + atomic_count_init(&pmd->pmd_overloaded, 0); + + pmd->intrvl_tsc_prev = 0; + atomic_store_relaxed(&pmd->intrvl_cycles, 0); + + if (!dpdk_attached) { + dpdk_attached = dpdk_attach_thread(pmd->core_id); + } + + /* List port/core affinity */ + for (i = 0; i < poll_cnt; i++) { + VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n", + pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx), + netdev_rxq_get_queue_id(poll_list[i].rxq->rx)); + /* Reset the rxq current cycles counter. */ + dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0); + for (int j = 0; j < PMD_INTERVAL_MAX; j++) { + dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0); + } + } + + if (!poll_cnt) { + if (wait_for_reload) { + /* Don't sleep, control thread will ask for a reload shortly. */ + do { + atomic_read_explicit(&pmd->reload, &reload, + memory_order_acquire); + } while (!reload); + } else { + while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) { + seq_wait(pmd->reload_seq, pmd->last_reload_seq); + poll_timer_wait_until(pmd->offload_next_upkeep / 1000); + poll_block(); + pmd_thread_offload_process(pmd, 1024); + } + } + } + + for (i = 0; i < PMD_INTERVAL_MAX; i++) { + atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0); + } + atomic_count_set(&pmd->intrvl_idx, 0); + cycles_counter_update(s); + + pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; + + /* Protect pmd stats from external clearing while polling. 
*/ + ovs_mutex_lock(&pmd->perf_stats.stats_mutex); + for (;;) { + uint64_t rx_packets = 0, tx_packets = 0; + unsigned int n_offload_msgs = 0; + bool quiet_idle = false; + uint64_t time_slept = 0; + uint64_t max_sleep; + + pmd_perf_start_iteration(s); + + atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db); + atomic_read_relaxed(&pmd->max_sleep, &max_sleep); + atomic_read_relaxed(&pmd->dp->pmd_quiet_idle, &quiet_idle); + + for (i = 0; i < poll_cnt; i++) { + + if (!poll_list[i].rxq_enabled) { + continue; + } + + if (poll_list[i].emc_enabled) { + atomic_read_relaxed(&pmd->dp->emc_insert_min, + &pmd->ctx.emc_insert_min); + } else { + pmd->ctx.emc_insert_min = 0; + } + + process_packets = + dp_netdev_process_rxq_port(pmd, poll_list[i].rxq, + poll_list[i].port_no); + rx_packets += process_packets; + if (process_packets >= PMD_SLEEP_THRESH) { + sleep_time = 0; + } + } + + if (!rx_packets) { + /* We didn't receive anything in the process loop. + * Check if we need to send something. + * There was no time updates on current iteration. */ + pmd_thread_ctx_time_update(pmd); + tx_packets = dp_netdev_pmd_flush_output_packets(pmd, + max_sleep && sleep_time + ? true : false); + } + + n_offload_msgs = pmd_thread_offload_process(pmd, 1024); + + /* Only manage an 'idle' state if it matters: + * if the pmd-quiet-idle configuration is enabled. */ + if (quiet_idle) { + /* If we have nothing to do, and we are not yet considered 'idle', + * transition to idle state. */ + if (!rx_packets && !tx_packets && !n_offload_msgs && !pmd->idle) { + dp_netdev_pmd_idle_begin(pmd); + } + } + + if (max_sleep) { + /* Check if a sleep should happen on this iteration. 
*/ + if (sleep_time) { + struct cycle_timer sleep_timer; + + cycle_timer_start(&pmd->perf_stats, &sleep_timer); + xnanosleep_no_quiesce(sleep_time * 1000); + time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer); + pmd_thread_ctx_time_update(pmd); + } + if (sleep_time < max_sleep) { + /* Increase sleep time for next iteration. */ + sleep_time += PMD_SLEEP_INC_US; + } else { + sleep_time = max_sleep; + } + } else { + /* Reset sleep time as max sleep policy may have been changed. */ + sleep_time = 0; + } + + /* Do RCU synchronization at fixed interval if not already in a + * continuous quiescent state. This ensures that synchronization + * would not be delayed long even at high load of packet processing. */ + if (!pmd->idle && pmd->ctx.now > pmd->next_rcu_quiesce) { + if (!ovsrcu_try_quiesce()) { + pmd->next_rcu_quiesce = + pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; + } + } + + if (lc++ > 1024) { + lc = 0; + + dp_netdev_pmd_idle_end(pmd); + coverage_try_clear(); + dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt); + if (!ovsrcu_try_quiesce()) { + emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache)); + pmd->next_rcu_quiesce = + pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; + } + + for (i = 0; i < poll_cnt; i++) { + uint64_t current_seq = + netdev_get_change_seq(poll_list[i].rxq->port->netdev); + if (poll_list[i].change_seq != current_seq) { + poll_list[i].change_seq = current_seq; + poll_list[i].rxq_enabled = + netdev_rxq_enabled(poll_list[i].rxq->rx); + } + } + } + + atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire); + if (OVS_UNLIKELY(reload)) { + break; + } + + pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept, + pmd_perf_metrics_enabled(pmd)); + } + ovs_mutex_unlock(&pmd->perf_stats.stats_mutex); + + poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); + atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload); + atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid); + atomic_read_relaxed(&pmd->exit, &exiting); + /* Signal 
here to make sure the pmd finishes + * reloading the updated configuration. */ + dp_netdev_pmd_reload_done(pmd); + + if (reload_tx_qid) { + pmd_free_static_tx_qid(pmd); + pmd_alloc_static_tx_qid(pmd); + } + + if (!exiting) { + goto reload; + } + + pmd_free_static_tx_qid(pmd); + dfc_cache_uninit(&pmd->flow_cache); + /* For e2e, PMDs should not handle offloads. */ + if (!dp_netdev_e2e_cache_enabled) { + pmd_thread_offload_uninit(pmd); + } + free(poll_list); + pmd_free_cached_ports(pmd); + if (dpdk_attached) { + dpdk_detach_thread(); + } + return NULL; +} + +static void +dp_netdev_disable_upcall(struct dp_netdev *dp) + OVS_ACQUIRES(dp->upcall_rwlock) +{ + fat_rwlock_wrlock(&dp->upcall_rwlock); +} + + +/* Meters */ +static void +dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED, + struct ofputil_meter_features *features) +{ + features->max_meters = MAX_METERS - 1; /* meter ID 0 is not used */ + features->band_types = DP_SUPPORTED_METER_BAND_TYPES; + features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK; + features->max_bands = MAX_BANDS; + features->max_color = 0; +} + +/* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic, + * i.e., if the result will be larger than 'max_value', will store 'max_value' + * instead. */ +static void +atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value) +{ + uint64_t current, new_value; + + atomic_read_relaxed(value, ¤t); + do { + new_value = current + n; + new_value = MIN(new_value, max_value); + } while (!atomic_compare_exchange_weak_relaxed(value, ¤t, + new_value)); +} + +/* Tries to atomically subtract 'n' from 'value'. Does not perform the + * operation and returns 'false' if the result will be less than 'min_value'. + * Otherwise, stores the result and returns 'true'. 
*/
+static bool
+atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value)
+{
+    uint64_t current;
+
+    /* Compare-and-swap loop.  The bound is re-checked against 'current'
+     * before every exchange attempt, so the stored value is only ever
+     * replaced by 'current - n' when 'current >= min_value + n'. */
+    atomic_read_relaxed(value, &current);
+    do {
+        if (current < min_value + n) {
+            return false;
+        }
+    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
+                                                   current - n));
+    return true;
+}
+
+/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
+ * that exceed a band are dropped in-place.
+ *
+ * 'now_ms' is the current time in milliseconds; it is used to refill the
+ * band buckets proportionally to the time elapsed since the meter was last
+ * used.  Does nothing if 'meter_id' is out of range or no such meter is
+ * found in 'dp->meters'. */
+static void
+dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
+                    uint32_t meter_id, long long int now_ms)
+{
+    const size_t cnt = dp_packet_batch_size(packets_);
+    uint32_t exceeded_rate[NETDEV_MAX_BURST];
+    uint32_t exceeded_band[NETDEV_MAX_BURST];
+    uint64_t bytes, volume, meter_used, old;
+    uint64_t band_packets[MAX_BANDS];
+    uint64_t band_bytes[MAX_BANDS];
+    struct dp_meter_band *band;
+    struct dp_packet *packet;
+    struct dp_meter *meter;
+    bool exceeded = false;
+
+    if (meter_id >= MAX_METERS) {
+        return;
+    }
+
+    meter = dp_meter_lookup(&dp->meters, meter_id);
+    if (!meter) {
+        return;
+    }
+
+    /* Initialize as negative values (all bits set, i.e. UINT32_MAX, which
+     * is the "no band exceeded" marker tested further down). */
+    memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
+    /* Initialize as zeroes. */
+    memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
+
+    /* Advance 'meter->used' to 'now_ms'.  On exit from the loop,
+     * 'meter_used' holds the previous timestamp if this thread advanced it,
+     * or 'now_ms' if the stored time was already at or past 'now_ms'. */
+    atomic_read_relaxed(&meter->used, &meter_used);
+    do {
+        if (meter_used >= now_ms) {
+            /* The '>' condition means that we have several threads hitting the
+             * same meter, and the other one already advanced the time. */
+            meter_used = now_ms;
+            break;
+        }
+    } while (!atomic_compare_exchange_weak_relaxed(&meter->used,
+                                                   &meter_used, now_ms));
+
+    /* Refill all buckets right away, since other threads may use them. */
+    if (meter_used < now_ms) {
+        /* All packets will hit the meter at the same time. */
+        uint64_t delta_t = now_ms - meter_used;
+
+        /* Make sure delta_t will not be too large, so that bucket will not
+         * wrap around below. */
+        delta_t = MIN(delta_t, meter->max_delta_t);
+
+        for (int m = 0; m < meter->n_bands; m++) {
+            band = &meter->bands[m];
+            /* Update band's bucket.  We can't just use atomic add here,
+             * because we should never add above the max capacity. */
+            atomic_sat_add(&band->bucket, delta_t * band->rate,
+                           band->burst_size * 1000ULL);
+        }
+    }
+
+    /* Update meter stats. */
+    atomic_add_relaxed(&meter->packet_count, cnt, &old);
+    bytes = 0;
+    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
+        bytes += dp_packet_size(packet);
+    }
+    atomic_add_relaxed(&meter->byte_count, bytes, &old);
+
+    /* Meters can operate in terms of packets per second or kilobits per
+     * second. */
+    if (meter->flags & OFPMF13_PKTPS) {
+        /* Rate in packets/second, bucket 1/1000 packets.
+         * msec * packets/sec = 1/1000 packets. */
+        volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
+    } else {
+        /* Rate in kbps, bucket in bits.
+         * msec * kbps = bits */
+        volume = bytes * 8;
+    }
+
+    /* Find the band hit with the highest rate for each packet (if any). */
+    for (int m = 0; m < meter->n_bands; m++) {
+        band = &meter->bands[m];
+
+        /* Drain the bucket for all the packets, if possible. */
+        if (atomic_bound_sub(&band->bucket, volume, 0)) {
+            continue;
+        }
+
+        /* Band limit hit, must process packet-by-packet. */
+        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
+            uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS)
+                ? 1000 : (dp_packet_size(packet) * 8);
+
+            if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) {
+                /* Update the exceeding band for the exceeding packet.
+                 * Only one band will be fired by a packet, and that can
+                 * be different for each packet. */
+                if (band->rate > exceeded_rate[i]) {
+                    exceeded_rate[i] = band->rate;
+                    exceeded_band[i] = m;
+                    exceeded = true;
+                }
+            }
+        }
+    }
+
+    /* No need to iterate over packets if there are no drops. */
+    if (!exceeded) {
+        return;
+    }
+
+    /* Fire the highest rate band exceeded by each packet, and drop
+     * packets if needed. */
+
+    memset(band_packets, 0, sizeof band_packets);
+    memset(band_bytes, 0, sizeof band_bytes);
+
+    size_t j;
+    DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
+        uint32_t m = exceeded_band[j];
+
+        if (m != UINT32_MAX) {
+            /* Meter drop packet. */
+            band_packets[m]++;
+            band_bytes[m] += dp_packet_size(packet);
+            dp_packet_delete(packet);
+        } else {
+            /* Meter accepts packet. */
+            dp_packet_batch_refill(packets_, packet, j);
+        }
+    }
+
+    /* Publish the per-band drop statistics gathered above. */
+    for (int m = 0; m < meter->n_bands; m++) {
+        if (!band_packets[m]) {
+            continue;
+        }
+        band = &meter->bands[m];
+        atomic_add_relaxed(&band->packet_count, band_packets[m], &old);
+        atomic_add_relaxed(&band->byte_count, band_bytes[m], &old);
+        COVERAGE_ADD(datapath_drop_meter, band_packets[m]);
+    }
+}
+
+/* Meter set/get/del processing is still single-threaded.
+ *
+ * Creates (or replaces) the meter 'meter_id' from 'config'.
+ *
+ * Returns 0 on success, otherwise a positive errno value:
+ *   EFBIG   'meter_id' is out of range.
+ *   EBADF   'config->flags' has bits outside DP_SUPPORTED_METER_FLAGS_MASK.
+ *   EINVAL  more than MAX_BANDS bands.
+ *   ENODEV  a band type other than OFPMBT13_DROP.
+ *
+ * May modify 'config': a band whose 'burst_size' is 0 gets it set to the
+ * band's 'rate'. */
+static int
+dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
+                      struct ofputil_meter_config *config)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    uint32_t mid = meter_id.uint32;
+    struct dp_meter *meter;
+    int i;
+
+    if (mid >= MAX_METERS) {
+        return EFBIG; /* Meter_id out of range. */
+    }
+
+    if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
+        return EBADF; /* Unsupported flags set */
+    }
+
+    if (config->n_bands > MAX_BANDS) {
+        return EINVAL;
+    }
+
+    for (i = 0; i < config->n_bands; ++i) {
+        switch (config->bands[i].type) {
+        case OFPMBT13_DROP:
+            break;
+        default:
+            return ENODEV; /* Unsupported band type */
+        }
+    }
+
+    /* Allocate meter */
+    meter = xzalloc(sizeof *meter
+                    + config->n_bands * sizeof(struct dp_meter_band));
+
+    meter->flags = config->flags;
+    meter->n_bands = config->n_bands;
+    meter->max_delta_t = 0;
+    meter->id = mid;
+    atomic_init(&meter->used, time_msec());
+
+    /* set up bands */
+    for (i = 0; i < config->n_bands; ++i) {
+        uint32_t band_max_delta_t;
+        uint64_t bucket_size;
+
+        /* Set burst size to a workable value if none specified. */
+        if (config->bands[i].burst_size == 0) {
+            config->bands[i].burst_size = config->bands[i].rate;
+        }
+
+        meter->bands[i].rate = config->bands[i].rate;
+        meter->bands[i].burst_size = config->bands[i].burst_size;
+        /* Start with a full bucket. */
+        bucket_size = meter->bands[i].burst_size * 1000ULL;
+        atomic_init(&meter->bands[i].bucket, bucket_size);
+
+        /* Figure out max delta_t that is enough to fill any bucket.
+         * NOTE(review): divides by the band rate; presumably callers reject
+         * a zero rate before getting here -- confirm. */
+        band_max_delta_t = bucket_size / meter->bands[i].rate;
+        if (band_max_delta_t > meter->max_delta_t) {
+            meter->max_delta_t = band_max_delta_t;
+        }
+    }
+
+    ovs_mutex_lock(&dp->meters_lock);
+
+    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */
+    dp_meter_attach(&dp->meters, meter);
+
+    ovs_mutex_unlock(&dp->meters_lock);
+
+    dp_netdev_offload_netdev_meter_set(mid, config);
+
+    return 0;
+}
+
+/* Looks up meter 'meter_id_' and, if 'stats' is nonnull, copies its packet
+ * and byte counters plus up to 'n_bands' per-band counters into 'stats'
+ * ('stats->n_bands' is set to the number of bands actually copied), then
+ * passes 'stats' to dp_netdev_offload_netdev_meter_get().
+ *
+ * Returns 0 on success, EFBIG if the id is out of range, or ENOENT if no
+ * such meter exists. */
+static int
+dpif_netdev_meter_get(const struct dpif *dpif,
+                      ofproto_meter_id meter_id_,
+                      struct ofputil_meter_stats *stats, uint16_t n_bands)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    uint32_t meter_id = meter_id_.uint32;
+    struct dp_meter *meter;
+    int retval = 0;
+
+    if (meter_id >= MAX_METERS) {
+        return EFBIG;
+    }
+
+    meter = dp_meter_lookup(&dp->meters, meter_id);
+    if (!meter) {
+        return ENOENT;
+    }
+
+    if (stats) {
+        int i = 0;
+
+        atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count);
+        atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count);
+
+        for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
+            atomic_read_relaxed(&meter->bands[i].packet_count,
+                                &stats->bands[i].packet_count);
+            atomic_read_relaxed(&meter->bands[i].byte_count,
+                                &stats->bands[i].byte_count);
+        }
+        stats->n_bands = i;
+
+        dp_netdev_offload_netdev_meter_get(meter_id, stats, stats->n_bands);
+    }
+
+    return retval;
+}
+
+/* Deletes meter 'meter_id_'.  The meter's statistics are first fetched with
+ * dpif_netdev_meter_get() (same 'stats' / 'n_bands' contract and the same
+ * error codes); the meter is removed only if that call succeeded.
+ *
+ * Returns 0 on success, otherwise the error from dpif_netdev_meter_get(). */
+static int
+dpif_netdev_meter_del(struct dpif *dpif,
+                      ofproto_meter_id meter_id_,
+                      struct ofputil_meter_stats *stats, uint16_t n_bands)
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    int error;
+
+    error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
+    if (!error) {
+        uint32_t meter_id = meter_id_.uint32;
+
+        ovs_mutex_lock(&dp->meters_lock);
+        dp_netdev_offload_netdev_meter_del(meter_id, stats, n_bands);
+        dp_meter_detach_free(&dp->meters, meter_id);
+        ovs_mutex_unlock(&dp->meters_lock);
+    }
+    return error;
+}
+
+
+/* dpif-level wrapper around dp_netdev_disable_upcall() for the datapath
+ * behind 'dpif'. */
+static void
+dpif_netdev_disable_upcall(struct dpif *dpif)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    dp_netdev_disable_upcall(dp);
+}
+
+/* Re-enables upcalls on 'dp' by unlocking 'dp->upcall_rwlock'. */
+static void
+dp_netdev_enable_upcall(struct dp_netdev *dp)
+    OVS_RELEASES(dp->upcall_rwlock)
+{
+    fat_rwlock_unlock(&dp->upcall_rwlock);
+}
+
+/* dpif-level wrapper around dp_netdev_enable_upcall() for the datapath
+ * behind 'dpif'. */
+static void
+dpif_netdev_enable_upcall(struct dpif *dpif)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    struct dp_netdev *dp = get_dp_netdev2(dpif);
+    dp_netdev_enable_upcall(dp);
+}
+
+/* Stores 'cb' in the file-level 'sflow_upcall_cb' pointer.  'dpif' is
+ * unused: the callback is shared rather than kept per datapath. */
+static void
+dpif_netdev_register_sflow_upcall_cb(struct dpif *dpif OVS_UNUSED,
+                                     sflow_upcall_callback *cb)
+{
+    sflow_upcall_cb = cb;
+}
+
+/* Marks a reload of 'pmd' as finished: clears the 'wait_for_reload' and
+ * 'reload_tx_qid' flags, records the current value of 'pmd->reload_seq',
+ * and finally clears 'pmd->reload' with release ordering so the earlier
+ * stores are published before the flag is seen cleared. */
+static void
+dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
+{
+    atomic_store_relaxed(&pmd->wait_for_reload, false);
+    atomic_store_relaxed(&pmd->reload_tx_qid, false);
+    pmd->last_reload_seq = seq_read(pmd->reload_seq);
+    atomic_store_explicit(&pmd->reload, false, memory_order_release);
+}
+
+/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
+ * the pointer on success, otherwise NULL (it can return NULL even if
+ * 'core_id' is NON_PMD_CORE_ID).
+ *
+ * The caller must unref the returned reference. */
+static struct dp_netdev_pmd_thread *
+dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
+{
+    struct dp_netdev_pmd_thread *pmd;
+
+    CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0),
+                             &dp->poll_threads) {
+        if (pmd->core_id == core_id) {
+            return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
+        }
+    }
+
+    return NULL;
+}
+
+/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads.
*/
+static void
+dp_netdev_set_nonpmd(struct dp_netdev *dp)
+    OVS_REQ_WRLOCK(dp->port_rwlock)
+{
+    /* Zero-filled thread structure, handed to the generic configure routine
+     * with the NON_PMD_CORE_ID / OVS_NUMA_UNSPEC placeholders. */
+    struct dp_netdev_pmd_thread *ctx = xzalloc(sizeof *ctx);
+
+    dp_netdev_configure_pmd(ctx, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
+}
+
+/* Attempts to take a reference on 'pmd', which the caller guarantees to be
+ * a valid pointer.  Returns the result of ovs_refcount_try_ref_rcu(). */
+static bool
+dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
+{
+    bool referenced = ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
+
+    return referenced;
+}
+
+/* Drops one reference on 'pmd' (NULL is accepted and ignored).  When the
+ * count observed by the unref was 1, destruction of 'pmd' is postponed via
+ * RCU. */
+static void
+dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
+{
+    if (!pmd) {
+        return;
+    }
+
+    if (ovs_refcount_unref(&pmd->ref_cnt) == 1) {
+        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
+    }
+}
+
+/* Walks 'dp->poll_threads' starting at cmap position 'pos' and returns the
+ * first thread on which a reference could be taken, or NULL once the end of
+ * the cmap is reached.  'pos' is advanced past every node visited.
+ *
+ * The caller must unref the returned reference. */
+static struct dp_netdev_pmd_thread *
+dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
+{
+    for (;;) {
+        struct cmap_node *cursor = cmap_next_position(&dp->poll_threads, pos);
+        struct dp_netdev_pmd_thread *candidate;
+
+        if (!cursor) {
+            /* End of the cmap. */
+            return NULL;
+        }
+
+        candidate = CONTAINER_OF(cursor, struct dp_netdev_pmd_thread, node);
+        if (!candidate) {
+            return NULL;
+        }
+        if (dp_netdev_pmd_try_ref(candidate)) {
+            return candidate;
+        }
+        /* Could not take a reference; try the following node. */
+    }
+}
+
+/* Configures the 'pmd' based on the input argument.
*/
+static void
+dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
+                        unsigned core_id, int numa_id)
+{
+    /* Identity and basic bookkeeping. */
+    pmd->dp = dp;
+    pmd->core_id = core_id;
+    pmd->numa_id = numa_id;
+    pmd->need_reload = false;
+    pmd->n_output_batches = 0;
+
+    /* Reference count, control flags, locks and flow containers. */
+    ovs_refcount_init(&pmd->ref_cnt);
+    atomic_init(&pmd->exit, false);
+    pmd->reload_seq = seq_create();
+    pmd->last_reload_seq = seq_read(pmd->reload_seq);
+    atomic_init(&pmd->reload, false);
+    ovs_mutex_init(&pmd->flow_mutex);
+    ovs_mutex_init(&pmd->port_mutex);
+    ovs_mutex_init(&pmd->bond_mutex);
+    cmap_init(&pmd->flow_table);
+    cmap_init(&pmd->classifiers);
+    cmap_init(&pmd->simple_match_table);
+    ccmap_init(&pmd->n_flows);
+    ccmap_init(&pmd->n_simple_flows);
+    pmd->ctx.last_rxq = NULL;
+    /* The deadlines below are all derived from 'pmd->ctx.now', so the
+     * context time has to be refreshed first. */
+    pmd_thread_ctx_time_update(pmd);
+    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
+    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
+    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
+    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
+                                      sizeof *pmd->busy_cycles_intrvl);
+    hmap_init(&pmd->poll_list);
+    hmap_init(&pmd->tx_ports);
+    hmap_init(&pmd->tnl_port_cache);
+    hmap_init(&pmd->send_port_cache);
+    cmap_init(&pmd->tx_bonds);
+
+    pmd_init_max_sleep(dp, pmd);
+
+    /* Initialize DPIF function pointer to the default configured version. */
+    atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default());
+
+    /* Init default miniflow_extract function */
+    atomic_init(&pmd->miniflow_extract_opt, dp_mfex_impl_get_default());
+
+    /* init the 'flow_cache' since there is no
+     * actual thread created for NON_PMD_CORE_ID. */
+    if (core_id == NON_PMD_CORE_ID) {
+        dfc_cache_init(&pmd->flow_cache);
+        pmd_alloc_static_tx_qid(pmd);
+    }
+    pmd_perf_stats_init(&pmd->perf_stats);
+    /* Publish the thread in 'dp->poll_threads', keyed by the hash of its
+     * core id.  This is done last, after every field above is set up. */
+    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
+                hash_int(core_id, 0));
+}
+
+/* Frees 'pmd' and everything it owns: flushes its flows, destroys its
+ * hash maps, cmaps, counters, mutexes and reload sequence, and frees the
+ * structure itself.  It is passed to ovsrcu_postpone() by
+ * dp_netdev_pmd_unref() when the last reference is dropped. */
+static void
+dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
+{
+    struct dpcls *cls;
+
+    dp_netdev_pmd_flow_flush(pmd);
+    hmap_destroy(&pmd->send_port_cache);
+    hmap_destroy(&pmd->tnl_port_cache);
+    hmap_destroy(&pmd->tx_ports);
+    cmap_destroy(&pmd->tx_bonds);
+    hmap_destroy(&pmd->poll_list);
+    free(pmd->busy_cycles_intrvl);
+    /* All flows (including their dpcls_rules) have been deleted already */
+    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
+        dpcls_destroy(cls);
+        ovsrcu_postpone(free, cls);
+    }
+    cmap_destroy(&pmd->classifiers);
+    cmap_destroy(&pmd->flow_table);
+    cmap_destroy(&pmd->simple_match_table);
+    ccmap_destroy(&pmd->n_flows);
+    ccmap_destroy(&pmd->n_simple_flows);
+    ovs_mutex_destroy(&pmd->flow_mutex);
+    seq_destroy(pmd->reload_seq);
+    ovs_mutex_destroy(&pmd->port_mutex);
+    ovs_mutex_destroy(&pmd->bond_mutex);
+    free(pmd->netdev_input_func_userdata);
+    free(pmd);
+}
+
+/* Stops the pmd thread, removes it from the 'dp->poll_threads',
+ * and unrefs the struct. */
+static void
+dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
+{
+    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
+     * but extra cleanup is necessary */
+    if (pmd->core_id == NON_PMD_CORE_ID) {
+        ovs_mutex_lock(&dp->non_pmd_mutex);
+        dfc_cache_uninit(&pmd->flow_cache);
+        pmd_free_cached_ports(pmd);
+        pmd_free_static_tx_qid(pmd);
+        ovs_mutex_unlock(&dp->non_pmd_mutex);
+    } else {
+        /* Ask the thread to exit, trigger a reload, then wait for the
+         * thread to terminate. */
+        atomic_store_relaxed(&pmd->exit, true);
+        dp_netdev_reload_pmd__(pmd);
+        xpthread_join(pmd->thread, NULL);
+    }
+
+    dp_netdev_pmd_clear_ports(pmd);
+
+    /* Purges the 'pmd''s flows after stopping the thread, but before
+     * destroying the flows, so that the flow stats can be collected. */
+    if (dp->dp_purge_cb) {
+        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
+    }
+    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
+    dp_netdev_pmd_unref(pmd);
+}
+
+/* Destroys all pmd threads.  If 'non_pmd' is true it also destroys the non pmd
+ * thread. */
+static void
+dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
+    OVS_EXCLUDED(dp->upcall_rwlock)
+{
+    struct dp_netdev_pmd_thread *pmd;
+    struct dp_netdev_pmd_thread **pmd_list;
+    size_t k = 0, n_pmds;
+
+    /* Snapshot the threads to delete into a temporary array first. */
+    n_pmds = cmap_count(&dp->poll_threads);
+    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
+
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
+            continue;
+        }
+        /* We cannot call dp_netdev_del_pmd(), since it alters
+         * 'dp->poll_threads' (while we're iterating it) and it
+         * might quiesce. */
+        ovs_assert(k < n_pmds);
+        pmd_list[k++] = pmd;
+    }
+
+    for (size_t i = 0; i < k; i++) {
+        dp_netdev_del_pmd(dp, pmd_list[i]);
+    }
+    free(pmd_list);
+}
+
+/* Deletes all rx queues from pmd->poll_list and all the ports from
+ * pmd->tx_ports.  Also removes every entry of pmd->tx_bonds; those entries
+ * are freed through RCU rather than immediately. */
+static void
+dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
+{
+    struct rxq_poll *poll;
+    struct tx_port *port;
+    struct tx_bond *tx;
+
+    ovs_mutex_lock(&pmd->port_mutex);
+    HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
+        free(poll);
+    }
+    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
+        free(port->txq_pkts);
+        free(port);
+    }
+    ovs_mutex_unlock(&pmd->port_mutex);
+
+    ovs_mutex_lock(&pmd->bond_mutex);
+    CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
+        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
+        ovsrcu_postpone(free, tx);
+    }
+    ovs_mutex_unlock(&pmd->bond_mutex);
+}
+
+/* Adds rx queue to poll_list of PMD thread, if it's not there already.
*/
+static void
+dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
+                         struct dp_netdev_rxq *rxq)
+    OVS_REQUIRES(pmd->port_mutex)
+{
+    int queue_id = netdev_rxq_get_queue_id(rxq->rx);
+    uint32_t bucket = hash_2words(odp_to_u32(rxq->port->port_no), queue_id);
+    struct rxq_poll *entry;
+    struct rxq_poll *added;
+
+    /* Nothing to do when this thread already polls 'rxq'. */
+    HMAP_FOR_EACH_WITH_HASH (entry, node, bucket, &pmd->poll_list) {
+        if (entry->rxq == rxq) {
+            return;
+        }
+    }
+
+    added = xmalloc(sizeof *added);
+    added->rxq = rxq;
+    hmap_insert(&pmd->poll_list, &added->node, bucket);
+
+    /* The poll list changed, so flag the thread as needing a reload. */
+    pmd->need_reload = true;
+}
+
+/* Unlinks 'poll' from the poll_list of 'pmd', frees it, and flags 'pmd' as
+ * needing a reload. */
+static void
+dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
+                           struct rxq_poll *poll)
+    OVS_REQUIRES(pmd->port_mutex)
+{
+    struct hmap *polled = &pmd->poll_list;
+
+    hmap_remove(polled, &poll->node);
+    free(poll);
+
+    pmd->need_reload = true;
+}
+
+/* Creates a tx cache entry for 'port' in 'pmd->tx_ports' unless one already
+ * exists.  'pmd' must be reloaded for the change to take effect, so it is
+ * flagged accordingly. */
+static void
+dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
+                             struct dp_netdev_port *port)
+    OVS_REQUIRES(pmd->port_mutex)
+{
+    struct tx_port *cached;
+
+    if (tx_port_lookup(&pmd->tx_ports, port->port_no)) {
+        /* This thread's tx cache already has 'port'. */
+        return;
+    }
+
+    cached = xzalloc(sizeof *cached);
+    cached->port = port;
+    cached->qid = -1;
+    cached->flush_time = 0LL;
+    dp_packet_batch_init(&cached->output_pkts);
+
+    /* In XPS-hash mode the entry also gets one packet batch per tx queue. */
+    if (cached->port->txq_mode == TXQ_MODE_XPS_HASH) {
+        int n_txq = netdev_n_txq(cached->port->netdev);
+        int q;
+
+        cached->txq_pkts = xzalloc(n_txq * sizeof *cached->txq_pkts);
+        for (q = 0; q < n_txq; q++) {
+            dp_packet_batch_init(&cached->txq_pkts[q]);
+        }
+    }
+
+    hmap_insert(&pmd->tx_ports, &cached->node,
+                hash_port_no(cached->port->port_no));
+    pmd->need_reload = true;
+}
+
+/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
+ * changes to take effect.
*/
+static void
+dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
+                               struct tx_port *tx)
+    OVS_REQUIRES(pmd->port_mutex)
+{
+    struct hmap *cache = &pmd->tx_ports;
+
+    hmap_remove(cache, &tx->node);
+    /* Release the per-queue batches (if any) before the entry itself. */
+    free(tx->txq_pkts);
+    free(tx);
+    pmd->need_reload = true;
+}
+
+/* Stores a private copy of 'bond' in the tx bond cmap of 'pmd'.
+ *
+ * When no entry with the same bond id exists, the copy is inserted.  When
+ * one exists and 'update' is true, it is replaced by the copy, which
+ * inherits the per-bucket packet and byte counters of the old entry; the old
+ * entry is freed through RCU.  When one exists and 'update' is false,
+ * nothing happens. */
+static void
+dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
+                             struct tx_bond *bond, bool update)
+    OVS_EXCLUDED(pmd->bond_mutex)
+{
+    struct tx_bond *existing;
+
+    ovs_mutex_lock(&pmd->bond_mutex);
+    existing = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
+
+    if (!existing) {
+        struct tx_bond *fresh = xmemdup(bond, sizeof *bond);
+
+        cmap_insert(&pmd->tx_bonds, &fresh->node,
+                    hash_bond_id(bond->bond_id));
+    } else if (update) {
+        struct tx_bond *replacement = xmemdup(bond, sizeof *bond);
+        int bucket;
+
+        /* Carry the statistics of every bucket over to the replacement. */
+        for (bucket = 0; bucket < BOND_BUCKETS; bucket++) {
+            uint64_t pkts, octets;
+
+            atomic_read_relaxed(&existing->member_buckets[bucket].n_packets,
+                                &pkts);
+            atomic_read_relaxed(&existing->member_buckets[bucket].n_bytes,
+                                &octets);
+            atomic_init(&replacement->member_buckets[bucket].n_packets, pkts);
+            atomic_init(&replacement->member_buckets[bucket].n_bytes, octets);
+        }
+        cmap_replace(&pmd->tx_bonds, &existing->node, &replacement->node,
+                     hash_bond_id(bond->bond_id));
+        ovsrcu_postpone(free, existing);
+    }
+    /* else: the entry exists and this is not an update -- leave it alone. */
+
+    ovs_mutex_unlock(&pmd->bond_mutex);
+}
+
+/* Delete bond from the tx bond cmap of 'pmd'.
*/
+static void
+dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
+                               uint32_t bond_id)
+    OVS_EXCLUDED(pmd->bond_mutex)
+{
+    struct tx_bond *tx;
+
+    ovs_mutex_lock(&pmd->bond_mutex);
+    tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
+    if (tx) {
+        /* Unlink now; the memory is released through RCU. */
+        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
+        ovsrcu_postpone(free, tx);
+    }
+    ovs_mutex_unlock(&pmd->bond_mutex);
+}
+
+/* Returns a newly allocated string describing the datapath version.
+ *
+ * NOTE(review): the literal below is empty in this copy of the patch.  Other
+ * angle-bracketed text (e.g. the system #include names) has been lost from
+ * this copy, so this may originally have been "<built-in>" -- confirm against
+ * the real file. */
+static char *
+dpif_netdev_get_datapath_version(void)
+{
+    return xstrdup("");
+}
+
+/* Updates the statistics of 'netdev_flow' for 'cnt' packets totalling 'size'
+ * bytes seen at time 'now': sets the 'used' timestamp, adds to the packet and
+ * byte counters, ORs 'tcp_flags' into the accumulated TCP flags, and, when
+ * the flow is marked 'partial_offload', also adds to the partial-offload
+ * counters. */
+static void
+dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
+                    uint16_t tcp_flags, long long now)
+{
+    uint16_t flags;
+
+    atomic_store_relaxed(&netdev_flow->stats.used, now);
+    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
+    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
+    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
+    flags |= tcp_flags;
+    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
+    if (netdev_flow->partial_offload) {
+        non_atomic_ullong_add(&netdev_flow->stats.partial_packet_count, cnt);
+        non_atomic_ullong_add(&netdev_flow->stats.partial_byte_count, size);
+    }
+}
+
+/* Passes 'packet_' to the upcall callback registered on the datapath of
+ * 'pmd' and returns the callback's result.
+ *
+ * Returns ENODEV if no upcall callback is registered.  When debug logging is
+ * not rate-limited away, the flow key and packet are logged first.  For
+ * upcall types other than DPIF_UC_MISS, dp_packet_ol_send_prepare() is
+ * applied to the packet before the callback runs. */
+static int
+dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
+                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
+                 enum dpif_upcall_type type, const struct nlattr *userdata,
+                 struct ofpbuf *actions, struct ofpbuf *put_actions)
+{
+    struct dp_netdev *dp = pmd->dp;
+
+    if (OVS_UNLIKELY(!dp->upcall_cb)) {
+        return ENODEV;
+    }
+
+    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+        char *packet_str;
+        struct ofpbuf key;
+        struct odp_flow_key_parms odp_parms = {
+            .flow = flow,
+            .mask = wc ? &wc->masks : NULL,
+            .support = dp_netdev_support,
+        };
+
+        ofpbuf_init(&key, 0);
+        odp_flow_key_from_flow(&odp_parms, &key);
+        packet_str = ofp_dp_packet_to_string(packet_);
+
+        odp_flow_key_format(key.data, key.size, &ds);
+
+        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
+                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
+
+        ofpbuf_uninit(&key);
+        free(packet_str);
+
+        ds_destroy(&ds);
+    }
+
+    if (type != DPIF_UC_MISS) {
+        dp_packet_ol_send_prepare(packet_, 0);
+    }
+
+    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
+                         actions, wc, put_actions, dp->upcall_aux);
+}
+
+/* Returns the hash to use for 'packet' in the flow caches.  The packet's RSS
+ * hash is used when valid; otherwise a 5-tuple hash of 'mf' is computed and
+ * stored in the packet.  A nonzero recirculation depth is then mixed into
+ * the returned value (but not into the hash stored in the packet). */
+static inline uint32_t
+dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
+                                const struct miniflow *mf)
+{
+    uint32_t hash, recirc_depth;
+
+    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
+        hash = dp_packet_get_rss_hash(packet);
+    } else {
+        hash = miniflow_hash_5tuple(mf, 0);
+        dp_packet_set_rss_hash(packet, hash);
+    }
+
+    /* The RSS hash must account for the recirculation depth to avoid
+     * collisions in the exact match cache */
+    recirc_depth = *recirc_depth_get_unsafe();
+    if (OVS_UNLIKELY(recirc_depth)) {
+        hash = hash_finish(hash, recirc_depth);
+    }
+    return hash;
+}
+
+/* Packets that matched the same flow, gathered so that the flow's actions
+ * can be executed on all of them at once. */
+struct packet_batch_per_flow {
+    unsigned int byte_count;        /* Total size of the packets in 'array'. */
+    unsigned int sample_byte_count; /* Part of 'byte_count' that belongs to
+                                     * packets with 'md.sample' set. */
+    uint16_t tcp_flags;             /* OR of the packets' TCP flags. */
+    struct dp_netdev_flow *flow;    /* The flow the packets matched. */
+
+    struct dp_packet_batch array;   /* The packets themselves. */
+};
+
+/* Appends 'packet' to 'batch' and folds its size and 'tcp_flags' into the
+ * batch totals. */
+static inline void
+packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
+                             struct dp_packet *packet,
+                             uint16_t tcp_flags)
+{
+    batch->byte_count += dp_packet_size(packet);
+    if (packet->md.sample) {
+        batch->sample_byte_count += dp_packet_size(packet);
+    }
+    batch->tcp_flags |= tcp_flags;
+    dp_packet_batch_add(&batch->array, packet);
+}
+
+/* Initializes 'batch' as an empty batch for 'flow' and links the two to each
+ * other ('flow->batch' is pointed at 'batch'). */
+static inline void
+packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
+                           struct dp_netdev_flow *flow)
+{
+    flow->batch = batch;
+
+    batch->flow = flow;
+    dp_packet_batch_init(&batch->array);
+    batch->byte_count = 0;
+    batch->sample_byte_count = 0;
+    batch->tcp_flags = 0;
+}
+
+/* Updates the statistics of the flow behind 'batch' (sampled packets and
+ * their bytes are excluded from the counts) and executes the flow's actions
+ * on the batched packets, skipping the first 'flow->skip_actions' actions. */
+static inline void
+packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
+                              struct dp_netdev_pmd_thread *pmd)
+{
+    struct dp_netdev_actions *actions;
+    struct dp_netdev_flow *flow = batch->flow;
+    struct nlattr *updated_actions;
+    size_t updated_actions_size;
+    uint32_t pkts_count;
+    uint32_t byte_count;
+    size_t pad;
+    int i;
+
+    pkts_count = dp_packet_batch_size(&batch->array) -
+                 dp_packet_batch_sampled_pkts(&batch->array);
+    byte_count = batch->byte_count - batch->sample_byte_count;
+
+    dp_netdev_flow_used(flow, pkts_count, byte_count, batch->tcp_flags,
+                        pmd->ctx.now / 1000);
+
+    /* Skip the actions that were executed by the HW. */
+    actions = dp_netdev_flow_get_actions(flow);
+    updated_actions = actions->actions;
+    updated_actions_size = actions->size;
+    /* NOTE(review): the loop does not check that 'skip_actions' stays within
+     * the number of actions actually present -- presumably guaranteed by
+     * whoever sets it; confirm. */
+    for (i = 0; i < flow->skip_actions; i++) {
+        pad = PAD_SIZE(updated_actions->nla_len, NLMSG_ALIGNTO);
+        updated_actions_size -= updated_actions->nla_len + pad;
+        updated_actions = nl_attr_next(updated_actions);
+    }
+
+    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow, flow,
+                              updated_actions, updated_actions_size);
+}
+
+/* Updates the statistics of the flow that owns 'rule' and executes that
+ * flow's actions on 'packets'.  'bytes' and 'sample_bytes' are the total and
+ * sampled byte counts of the batch; only their difference is accounted.
+ *
+ * NOTE(review): unlike packet_batch_per_flow_execute(), this path executes
+ * the full action list and does not honor 'flow->skip_actions' -- confirm
+ * this is intended. */
+void
+dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
+                        struct dp_packet_batch *packets,
+                        struct dpcls_rule *rule,
+                        uint32_t bytes,
+                        uint32_t sample_bytes,
+                        uint16_t tcp_flags)
+{
+    /* Gets action* from the rule. */
+    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
+    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
+    uint32_t pkts_num = dp_packet_batch_size(packets) -
+                        dp_packet_batch_sampled_pkts(packets);
+
+    dp_netdev_flow_used(flow, pkts_num, bytes - sample_bytes, tcp_flags,
+                        pmd->ctx.now / 1000);
+    const uint32_t steal = 1;
+    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow, flow,
+                              actions->actions, actions->size);
+}
+
+/* Adds 'pkt' to the batch of 'flow'.  If the flow has no batch yet, the next
+ * free slot of 'batches' is taken (incrementing '*n_batches') and
+ * initialized for it. */
+static inline void
+dp_netdev_queue_batches(struct dp_packet *pkt,
+                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
+                        struct packet_batch_per_flow *batches,
+                        size_t *n_batches)
+{
+    struct packet_batch_per_flow *batch = flow->batch;
+
+    if (OVS_UNLIKELY(!batch)) {
+        batch = &batches[(*n_batches)++];
+        packet_batch_per_flow_init(batch, flow);
+    }
+
+    packet_batch_per_flow_update(batch, pkt, tcp_flags);
+}
+
+/* SMC lookup function for a batch of packets.
+ * By doing batching SMC lookup, we can use prefetch
+ * to hide memory access latency.
+ *
+ * 'keys[i]' is the key of the i-th of the 'cnt' packets in 'packets_' and
+ * 'index_map[i]' is that packet's slot in 'flow_map'.  Packets that hit are
+ * stored into 'flow_map'; packets that miss are kept in 'packets_' (moved to
+ * its beginning), with 'missed_keys' and 'index_map' compacted to match.
+ */
+static inline void
+smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
+                 struct netdev_flow_key *keys,
+                 struct netdev_flow_key **missed_keys,
+                 struct dp_packet_batch *packets_,
+                 const int cnt,
+                 struct dp_packet_flow_map *flow_map,
+                 uint8_t *index_map)
+{
+    int i;
+    struct dp_packet *packet;
+    size_t n_smc_hit = 0, n_missed = 0;
+    struct dfc_cache *cache = &pmd->flow_cache;
+    struct smc_cache *smc_cache = &cache->smc_cache;
+    const struct cmap_node *flow_node;
+    int recv_idx;
+    uint16_t tcp_flags;
+
+    /* Prefetch buckets for all packets */
+    for (i = 0; i < cnt; i++) {
+        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
+    }
+
+    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
+        struct dp_netdev_flow *flow = NULL;
+        flow_node = smc_entry_get(pmd, keys[i].hash);
+        bool hit = false;
+        /* Get the original order of this packet in received batch. */
+        recv_idx = index_map[i];
+
+        if (OVS_LIKELY(flow_node != NULL)) {
+            CMAP_NODE_FOR_EACH (flow, node, flow_node) {
+                /* Since we don't have per-port megaflow to check the port
+                 * number, we need to verify that the input ports match. */
+                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
+                    flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
+                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
+
+                    /* SMC hit and emc miss, we insert into EMC */
+                    keys[i].len =
+                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
+                    emc_probabilistic_insert(pmd, &keys[i], flow);
+                    /* Add these packets into the flow map in the same order
+                     * as received.
+                     */
+                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
+                                               flow_map, recv_idx);
+                    n_smc_hit++;
+                    hit = true;
+
+                    if (dp_netdev_e2e_cache_enabled) {
+                        e2e_cache_trace_add_flow(packet, &flow->mega_ufid);
+                    }
+                    break;
+                }
+            }
+            if (hit) {
+                continue;
+            }
+        }
+
+        /* SMC missed.  Group missed packets together at
+         * the beginning of the 'packets' array. */
+        dp_packet_batch_refill(packets_, packet, i);
+
+        /* Preserve the order of packet for flow batching. */
+        index_map[n_missed] = recv_idx;
+
+        /* Put missed keys into the pointer array returned to the caller. */
+        missed_keys[n_missed++] = &keys[i];
+    }
+
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_MISS, n_missed);
+}
+
+/* Single-packet SMC lookup.  Returns the flow found through the SMC entry
+ * for 'key->hash' whose rule matches 'key' and whose input port equals that
+ * of 'packet', or NULL if there is none.  Unlike smc_lookup_batch(), this
+ * neither inserts into the EMC nor updates any counters. */
+struct dp_netdev_flow *
+smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
+                  struct dp_packet *packet,
+                  struct netdev_flow_key *key)
+{
+    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
+
+    if (OVS_LIKELY(flow_node != NULL)) {
+        struct dp_netdev_flow *flow = NULL;
+
+        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
+            /* Since we don't have per-port megaflow to check the port
+             * number, we need to verify that the input ports match. */
+            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
+                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
+
+                return (void *) flow;
+            }
+        }
+    }
+
+    return NULL;
+}
+
+/* Finds the flow, if any, that hardware offload associated with 'packet'.
+ *
+ * On return of 0, '*flow' is the flow found through the packet's flow mark
+ * (and is flagged 'partial_offload'), or NULL when the packet carries no
+ * mark or no flow is found for it.  The exception is a packet flagged
+ * 'md.sample' after recovery: 0 is returned early and '*flow' is left
+ * untouched.
+ *
+ * Returns -1 when the packet must not be processed further: either it was
+ * consumed by the sFlow upcall (and deleted here), or hardware miss recovery
+ * failed.
+ *
+ * 'skip_actions' and 'dump' are only used when ALLOW_EXPERIMENTAL_API is
+ * defined, where they are passed on to netdev_hw_miss_packet_recover(). */
+inline int
+dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
+                  struct dp_packet *packet,
+                  struct dp_netdev_flow **flow,
+                  uint8_t *skip_actions OVS_UNUSED, bool dump OVS_UNUSED)
+{
+    struct user_action_cookie sflow_cookie;
+    struct flow_tnl sflow_tunnel_info;
+    struct dpif_sflow_attr sflow_attr OVS_UNUSED = {
+        .userdata = &sflow_cookie,
+        .tunnel = &sflow_tunnel_info };
+    uint32_t mark;
+
+#ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */
+    /* Restore the packet if HW processing was terminated before completion. */
+    struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
+    bool miss_api_supported;
+
+    atomic_read_relaxed(&rxq->port->netdev->hw_info.miss_api_supported,
+                        &miss_api_supported);
+    if (miss_api_supported) {
+        int err = netdev_hw_miss_packet_recover(rxq->port->netdev, packet,
+                                                skip_actions, &sflow_attr, dump);
+
+        /* Packet sample flag indicates successfully recovered sFlow packet,
+         * this packet will need to go through normal datapath processing and
+         * will be dropped once sample action is reached.
+         */
+        if (packet->md.sample) {
+            return 0;
+        }
+        /* Return code EIO for this case indicates successfully recovered sFlow
+         * packet that does not need to go through normal datapath processing,
+         * handle this packet in the sFlow upcall then drop it from the
+         * datapath.
+         */
+        if (err == EIO) {
+            struct dpif_upcall_sflow dupcall;
+
+            dupcall.iifindex = -1;
+            dupcall.packet = *packet;
+            dupcall.in_port = packet->md.in_port.odp_port;
+            dupcall.sflow_attr = &sflow_attr;
+            /* NOTE(review): 'sflow_upcall_cb' is called without a NULL check;
+             * presumably a callback is always registered before this path can
+             * be reached -- confirm. */
+            sflow_upcall_cb(&dupcall);
+            dp_packet_delete(packet);
+            return -1;
+        }
+        if (err && err != EOPNOTSUPP) {
+            COVERAGE_INC(datapath_drop_hw_miss_recover);
+            return -1;
+        }
+    }
+#endif
+
+    /* If no mark, no flow to find. */
+    if (dp_packet_has_flow_mark(packet, &mark)) {
+        *flow = mark_to_flow_find(pmd, mark);
+        if (*flow) {
+            (*flow)->partial_offload = true;
+        }
+    } else {
+        *flow = NULL;
+    }
+
+    dp_packet_reset_offload(packet);
+    return 0;
+}
+
+/* Enqueues an already classified packet into per-flow batches or the flow
+ * map, depending on whether batching is enabled.  In the flow map case the
+ * packet takes slot '*map_cnt', which is then incremented. */
+static inline void
+dfc_processing_enqueue_classified_packet(struct dp_packet *packet,
+                                         struct dp_netdev_flow *flow,
+                                         uint16_t tcp_flags,
+                                         bool batch_enable,
+                                         struct packet_batch_per_flow *batches,
+                                         size_t *n_batches,
+                                         struct dp_packet_flow_map *flow_map,
+                                         size_t *map_cnt)
+
+{
+    if (OVS_LIKELY(batch_enable)) {
+        dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
+                                n_batches);
+    } else {
+        /* Flow batching should be performed only after fast-path
+         * processing is also completed for packets with emc miss
+         * or else it will result in reordering of packets with
+         * same datapath flows. */
+        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
+                                   flow_map, (*map_cnt)++);
+    }
+
+}
+
+/* Length limit passed to dp_packet_ds_put_hex() when dumping a packet. */
+#define PKT_DUMP_MAX_LEN 180
+
+/* Logs, at INFO level, 'port_no', the packet's input port and recirculation
+ * id, and a hex dump of 'pkt' limited by PKT_DUMP_MAX_LEN.  'prefix' is
+ * printed first. */
+static void
+dump_sw_packet(const char *prefix, odp_port_t port_no, struct dp_packet *pkt)
+{
+    struct ds s;
+
+    ds_init(&s);
+
+    VLOG_INFO("%sport_no=%d: in_port=%d, recirc_id=%d, %s", prefix, port_no,
+              pkt->md.in_port.odp_port, pkt->md.recirc_id,
+              ds_cstr(dp_packet_ds_put_hex(&s, pkt, PKT_DUMP_MAX_LEN)));
+
+    ds_destroy(&s);
+}
+
+/* Tries to pop the outer tunnel header of 'packet' using the tunnel ports
+ * cached in 'pmd->tnl_port_cache', stopping at the first port whose class
+ * supports explicit headers for this packet and whose pop_header() returns
+ * nonzero.
+ *
+ * Returns the number of bytes the packet's data pointer moved as a result
+ * (0 if nothing was popped).  The packet metadata and the l2/l3/l4 offsets
+ * saved on entry are restored before returning, except 'md.tunnel', which
+ * keeps the value it has after the pop attempt. */
+static int
+parse_packet_tnl(const struct dp_netdev_pmd_thread *pmd,
+                 struct dp_packet *packet)
+{
+    struct pkt_metadata md;
+    uint16_t l2_pad_size;
+    struct flow_tnl tnl;
+    struct tx_port *tx;
+    uint16_t l2_5_ofs;
+    void *orig_data;
+    uint16_t l3_ofs;
+    uint16_t l4_ofs;
+    int offset = 0;
+
+    /* Parse the packet as received, then save what is restored at the end. */
+    parse_tcp_flags(packet, NULL, NULL, NULL);
+    md = packet->md;
+    l2_pad_size = packet->l2_pad_size;
+    l2_5_ofs = packet->l2_5_ofs;
+    l3_ofs = packet->l3_ofs;
+    l4_ofs = packet->l4_ofs;
+
+    HMAP_FOR_EACH (tx, node, &pmd->tnl_port_cache) {
+        const struct netdev *netdev = tx->port->netdev;
+
+        if (!netdev->netdev_class->support_explicit_header ||
+            !netdev->netdev_class->support_explicit_header(netdev, packet)) {
+            continue;
+        }
+
+        orig_data = dp_packet_data(packet);
+        if (netdev->netdev_class->pop_header(netdev, packet, true)) {
+            offset = (uint8_t *) dp_packet_data(packet) - (uint8_t *) orig_data;
+            break;
+        }
+    }
+
+    /* Parse again (the data pointer may have moved), then restore the saved
+     * state while keeping the tunnel metadata. */
+    parse_tcp_flags(packet, NULL, NULL, NULL);
+    tnl = packet->md.tunnel;
+    packet->md = md;
+    packet->md.tunnel = tnl;
+    packet->l2_pad_size = l2_pad_size;
+    packet->l2_5_ofs = l2_5_ofs;
+    packet->l3_ofs = l3_ofs;
+    packet->l4_ofs = l4_ofs;
+
+    return offset;
+}
+
+/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
+ * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
+ * miniflow is copied into 'keys' and the packet pointer is moved at the
+ * beginning of the 'packets' array. The pointers of missed keys are put in the
+ * missed_keys pointer array for future processing.
+ *
+ * The function returns the number of packets that needs to be processed in the
+ * 'packets' array (they have been moved to the beginning of the vector).
+ *
+ * For performance reasons a caller may choose not to initialize the metadata
+ * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets'
+ * is not valid and must be initialized by this function using 'port_no'.
+ * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
+ * will be ignored.
+ */
+static inline size_t
+dfc_processing(struct dp_netdev_pmd_thread *pmd,
+               struct dp_packet_batch *packets_,
+               struct netdev_flow_key *keys,
+               struct netdev_flow_key **missed_keys,
+               struct packet_batch_per_flow batches[], size_t *n_batches,
+               struct dp_packet_flow_map *flow_map,
+               size_t *n_flows, uint8_t *index_map,
+               bool md_is_valid, odp_port_t port_no)
+{
+    const bool netdev_flow_api = netdev_is_flow_api_enabled();
+    const uint32_t recirc_depth = *recirc_depth_get();
+    const size_t cnt = dp_packet_batch_size(packets_);
+    size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0;
+    /* 'n_mfex_opt_hit' is never incremented in this function, so the
+     * PMD_STAT_MFEX_OPT_HIT update below always adds zero. */
+    size_t n_mfex_opt_hit = 0, n_simple_hit = 0;
+    size_t n_emc_miss = 0, n_simple_miss = 0;
+    struct dfc_cache *cache = &pmd->flow_cache;
+    struct netdev_flow_key *key = &keys[0];
+    struct dp_packet *packet;
+    size_t map_cnt = 0;
+    bool batch_enable = true;
+    /* NOTE(review): 'skip_actions' and 'parse_tnl_offset' live outside the
+     * per-packet loop and are not reset at the start of an iteration, so a
+     * value produced for one packet stays visible to the following ones
+     * unless it is overwritten -- confirm this is intended. */
+    uint8_t skip_actions = 0;
+    int parse_tnl_offset = 0;
+
+    const bool simple_match_enabled =
+        !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no);
+    /* 'simple_match_table' is a full flow table.  If the flow is not there,
+     * upcall is required, and there is no chance to find a match in caches. */
+    const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db;
+    const uint32_t cur_min = simple_match_enabled
+                             ? 0 : pmd->ctx.emc_insert_min;
+
+    pmd_perf_update_counter(&pmd->perf_stats,
+                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
+                            cnt);
+    int i;
+    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
+        struct dp_netdev_flow *flow = NULL;
+        uint16_t tcp_flags;
+
+        /* Drop packets too short to hold an Ethernet header. */
+        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
+            dp_packet_delete(packet);
+            COVERAGE_INC(datapath_drop_rx_invalid_packet);
+            continue;
+        }
+
+        if (i != cnt - 1) {
+            struct dp_packet **packets = packets_->packets;
+            /* Prefetch next packet data and metadata. */
+            OVS_PREFETCH(dp_packet_data(packets[i+1]));
+            pkt_metadata_prefetch_init(&packets[i+1]->md);
+        }
+
+        if (!md_is_valid) {
+            pkt_metadata_init(&packet->md, port_no);
+            if (dp_netdev_e2e_cache_enabled) {
+                dp_packet_e2e_init(packet);
+            }
+        }
+
+        /* Stage 1: flow identified by hardware offload.  Only tried for
+         * packets that have not been recirculated. */
+        if (netdev_flow_api && recirc_depth == 0) {
+            bool flag;
+
+            dp_netdev_read_dump_packets_enabled(&flag);
+            if (OVS_UNLIKELY(flag)) {
+                dump_sw_packet("", port_no, packet);
+            }
+            if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow, &skip_actions, flag))) {
+                /* Packet restoration failed and it was dropped, do not
+                 * continue processing.
+                 */
+                continue;
+            }
+            if (OVS_UNLIKELY(flag)) {
+                dump_sw_packet("post-hw-recover: ", port_no, packet);
+            }
+            if (OVS_LIKELY(flow)) {
+                flow->skip_actions = skip_actions;
+                tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL);
+                if (!packet->md.sample) {
+                    n_phwol_hit++;
+                }
+                dfc_processing_enqueue_classified_packet(
+                        packet, flow, tcp_flags, batch_enable,
+                        batches, n_batches, flow_map, &map_cnt);
+                if (dp_netdev_e2e_cache_enabled) {
+                    e2e_cache_trace_add_flow(packet, &flow->mega_ufid);
+                }
+                continue;
+            }
+        }
+
+        /* Stage 2: simple match table, when enabled for this port. */
+        if (!flow && simple_match_enabled) {
+            ovs_be16 dl_type = 0, vlan_tci = 0;
+            uint8_t nw_frag = 0;
+
+            tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci);
+            flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type,
+                                                 nw_frag, vlan_tci);
+            if (OVS_LIKELY(flow)) {
+                if (!packet->md.sample) {
+                    n_simple_hit++;
+                }
+                dfc_processing_enqueue_classified_packet(
+                        packet, flow, tcp_flags, batch_enable,
+                        batches, n_batches, flow_map, &map_cnt);
+                if (dp_netdev_e2e_cache_enabled) {
+                    e2e_cache_trace_add_flow(packet, &flow->mega_ufid);
+                }
+                continue;
+            } else if (!packet->md.sample) {
+                n_simple_miss++;
+            }
+        }
+
+        /* Stage 3: extract the key and look it up in the EMC. */
+
+        /* In case it is the first recirc implicitly parse the outer header,
+         * if exists and fits one of the tunnels configured.
+         */
+        if (recirc_depth == 0 && packet->md.recirc_id == 0) {
+            parse_tnl_offset = parse_packet_tnl(pmd, packet);
+        }
+        /* The packet flow parsing is done according to the inner. */
+        miniflow_extract(packet, &key->mf);
+        /* In case the packet outer header was parsed, it was also popped.
+         * Restore it.
+         */
+        if (parse_tnl_offset) {
+            dp_packet_set_size(packet,
+                               dp_packet_size(packet) + parse_tnl_offset);
+            dp_packet_set_data(packet, ((uint8_t *) dp_packet_data(packet) -
+                               parse_tnl_offset));
+        }
+        key->len = 0; /* Not computed yet. */
+        key->hash =
+                (md_is_valid == false)
+                ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
+                : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
+
+        /* If EMC is disabled skip emc_lookup */
+        flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
+        if (OVS_LIKELY(flow)) {
+            tcp_flags = miniflow_get_tcp_flags(&key->mf);
+            if (!packet->md.sample) {
+                n_emc_hit++;
+            }
+            dfc_processing_enqueue_classified_packet(
+                    packet, flow, tcp_flags, batch_enable,
+                    batches, n_batches, flow_map, &map_cnt);
+            if (dp_netdev_e2e_cache_enabled) {
+                e2e_cache_trace_add_flow(packet, &flow->mega_ufid);
+            }
+        } else {
+            if (cur_min != 0 && !packet->md.sample) {
+                n_emc_miss++;
+            }
+            /* Exact match cache missed. Group missed packets together at
+             * the beginning of the 'packets' array. */
+            dp_packet_batch_refill(packets_, packet, i);
+
+            /* Preserve the order of packet for flow batching. */
+            index_map[n_missed] = map_cnt;
+            flow_map[map_cnt++].flow = NULL;
+
+            /* 'key[n_missed]' contains the key of the current packet and it
+             * will be passed to SMC lookup. The next key should be extracted
+             * to 'keys[n_missed + 1]'.
+             * We also maintain a pointer array to keys missed both SMC and EMC
+             * which will be returned to the caller for future processing. */
+            missed_keys[n_missed] = key;
+            key = &keys[++n_missed];
+
+            /* Skip batching for subsequent packets to avoid reordering. */
+            batch_enable = false;
+        }
+    }
+    /* Count of packets which are not flow batched. */
+    *n_flows = map_cnt;
+
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT,
+                            n_mfex_opt_hit);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT,
+                            n_simple_hit);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_MISS,
+                            n_simple_miss);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_MISS, n_emc_miss);
+
+    if (!smc_enable_db) {
+        return dp_packet_batch_size(packets_);
+    }
+
+    /* Packets that missed the EMC do a batch lookup in the SMC if enabled. */
+    smc_lookup_batch(pmd, keys, missed_keys, packets_,
+                     n_missed, flow_map, index_map);
+
+    return dp_packet_batch_size(packets_);
+}
+
+static inline int
+handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
+                     struct dp_packet *packet,
+                     const struct netdev_flow_key *key,
+                     struct ofpbuf *actions, struct ofpbuf *put_actions)
+{
+    struct dp_netdev_flow *netdev_flow = NULL;
+    struct ofpbuf *add_actions;
+    struct dp_packet_batch b;
+    struct match match;
+    ovs_u128 ufid;
+    int error;
+    uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
+    odp_port_t orig_in_port = packet->md.orig_in_port;
+
+    match.tun_md.valid = false;
+    miniflow_expand(&key->mf, &match.flow);
+    memset(&match.wc, 0, sizeof match.wc);
+
+    ofpbuf_clear(actions);
+    ofpbuf_clear(put_actions);
+
+    odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
+    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
+                             &ufid, DPIF_UC_MISS, NULL, actions,
+                             put_actions);
+    if (OVS_UNLIKELY(error && error != ENOSPC)) {
+        dp_packet_delete(packet);
+        COVERAGE_INC(datapath_drop_upcall_error);
+        return error;
+    }
+
+    /* The Netlink encoding of datapath flow keys cannot express
+     * wildcarding the presence of a VLAN tag.
Instead, a missing VLAN + * tag is interpreted as exact match on the fact that there is no + * VLAN. Unless we refactor a lot of code that translates between + * Netlink and struct flow representations, we have to do the same + * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */ + if (!match.wc.masks.vlans[0].tci) { + match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI); + } + + add_actions = put_actions->size ? put_actions : actions; + if (OVS_LIKELY(error != ENOSPC)) { + /* XXX: There's a race window where a flow covering this packet + * could have already been installed since we last did the flow + * lookup before upcall. This could be solved by moving the + * mutex lock outside the loop, but that's an awful long time + * to be locking revalidators out of making flow modifications. */ + ovs_mutex_lock(&pmd->flow_mutex); + netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); + if (OVS_LIKELY(!netdev_flow)) { + netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid, + add_actions->data, + add_actions->size, orig_in_port); + } + ovs_mutex_unlock(&pmd->flow_mutex); + uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid); + smc_insert(pmd, key, hash); + emc_probabilistic_insert(pmd, key, netdev_flow); + if (dp_netdev_e2e_cache_enabled) { + e2e_cache_trace_add_flow(packet, &netdev_flow->mega_ufid); + } + } + + /* We can't allow the packet batching in the next loop to execute + * the actions. Otherwise, if there are any slow path actions, + * we'll send the packet up twice. */ + dp_packet_batch_init_packet(&b, packet); + dp_netdev_execute_actions(pmd, &b, true, &match.flow, netdev_flow, + actions->data, actions->size); + + if (pmd_perf_metrics_enabled(pmd)) { + /* Update upcall stats. 
*/ + cycles = cycles_counter_update(&pmd->perf_stats) - cycles; + struct pmd_perf_stats *s = &pmd->perf_stats; + s->current.upcalls++; + s->current.upcall_cycles += cycles; + histogram_add_sample(&s->cycles_per_upcall, cycles); + } + return error; +} + +static inline void +fast_path_processing(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets_, + struct netdev_flow_key **keys, + struct dp_packet_flow_map *flow_map, + uint8_t *index_map, + odp_port_t in_port) +{ + const size_t cnt = dp_packet_batch_size(packets_); +#if !defined(__CHECKER__) && !defined(_WIN32) + const size_t PKT_ARRAY_SIZE = cnt; +#else + /* Sparse or MSVC doesn't like variable length array. */ + enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST }; +#endif + struct dp_packet *packet; + struct dpcls *cls; + struct dpcls_rule *rules[PKT_ARRAY_SIZE]; + struct dp_netdev *dp = pmd->dp; + int upcall_ok_cnt = 0, upcall_fail_cnt = 0; + int lookup_cnt = 0, add_lookup_cnt; + bool any_miss; + + for (size_t i = 0; i < cnt; i++) { + /* Key length is needed in all the cases, hash computed on demand. */ + keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf)); + } + /* Get the classifier for the in_port */ + cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port); + if (OVS_LIKELY(cls)) { + any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys, + rules, cnt, &lookup_cnt); + } else { + any_miss = true; + memset(rules, 0, sizeof(rules)); + } + if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) { + uint64_t actions_stub[512 / 8], slow_stub[512 / 8]; + struct ofpbuf actions, put_actions; + + ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub); + ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub); + + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + struct dp_netdev_flow *netdev_flow; + + if (OVS_LIKELY(rules[i])) { + continue; + } + + /* It's possible that an earlier slow path execution installed + * a rule covering this flow. 
In this case, it's a lot cheaper + * to catch it here than execute a miss. */ + netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i], + &add_lookup_cnt); + if (netdev_flow) { + lookup_cnt += add_lookup_cnt; + rules[i] = &netdev_flow->cr; + continue; + } + + int error = handle_packet_upcall(pmd, packet, keys[i], + &actions, &put_actions); + + if (OVS_UNLIKELY(error)) { + upcall_fail_cnt++; + } else { + upcall_ok_cnt++; + } + } + + ofpbuf_uninit(&actions); + ofpbuf_uninit(&put_actions); + fat_rwlock_unlock(&dp->upcall_rwlock); + } else if (OVS_UNLIKELY(any_miss)) { + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + if (OVS_UNLIKELY(!rules[i])) { + dp_packet_delete(packet); + COVERAGE_INC(datapath_drop_lock_error); + upcall_fail_cnt++; + } + } + } + + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + struct dp_netdev_flow *flow; + /* Get the original order of this packet in received batch. */ + int recv_idx = index_map[i]; + uint16_t tcp_flags; + + if (OVS_UNLIKELY(!rules[i])) { + continue; + } + + flow = dp_netdev_flow_cast(rules[i]); + uint32_t hash = dp_netdev_flow_hash(&flow->ufid); + smc_insert(pmd, keys[i], hash); + + emc_probabilistic_insert(pmd, keys[i], flow); + + if (dp_netdev_e2e_cache_enabled) { + e2e_cache_trace_add_flow(packet, &flow->mega_ufid); + } + + /* Add these packets into the flow map in the same order + * as received. + */ + tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf); + packet_enqueue_to_flow_map(packet, flow, tcp_flags, + flow_map, recv_idx); + } + + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT, + cnt - upcall_ok_cnt - upcall_fail_cnt); + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP, + lookup_cnt); + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS, + upcall_ok_cnt); + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST, + upcall_fail_cnt); +} + +/* Packets enter the datapath from a port (or from recirculation) here. 
+ * + * When 'md_is_valid' is true the metadata in 'packets' are already valid. + * When false the metadata in 'packets' need to be initialized. */ +static void +dp_netdev_input__(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets, + bool md_is_valid, odp_port_t port_no) +{ +#if !defined(__CHECKER__) && !defined(_WIN32) + const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets); +#else + /* Sparse or MSVC doesn't like variable length array. */ + enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST }; +#endif + OVS_ALIGNED_VAR(CACHE_LINE_SIZE) + struct netdev_flow_key keys[PKT_ARRAY_SIZE]; + struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE]; + struct packet_batch_per_flow batches[PKT_ARRAY_SIZE]; + size_t n_batches; + struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE]; + uint8_t index_map[PKT_ARRAY_SIZE]; + size_t n_flows, i; + + odp_port_t in_port; + + n_batches = 0; + dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches, + flow_map, &n_flows, index_map, md_is_valid, port_no); + + if (!dp_packet_batch_is_empty(packets)) { + /* Get ingress port from first packet's metadata. */ + in_port = packets->packets[0]->md.in_port.odp_port; + fast_path_processing(pmd, packets, missed_keys, + flow_map, index_map, in_port); + } + + /* Batch rest of packets which are in flow map. */ + for (i = 0; i < n_flows; i++) { + struct dp_packet_flow_map *map = &flow_map[i]; + + if (OVS_UNLIKELY(!map->flow)) { + continue; + } + dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags, + batches, &n_batches); + } + + /* All the flow batches need to be reset before any call to + * packet_batch_per_flow_execute() as it could potentially trigger + * recirculation. When a packet matching flow 'j' happens to be + * recirculated, the nested call to dp_netdev_input__() could potentially + * classify the packet as matching another flow - say 'k'. 
It could happen + * that in the previous call to dp_netdev_input__() that same flow 'k' had + * already its own batches[k] still waiting to be served. So if its + * 'batch' member is not reset, the recirculated packet would be wrongly + * appended to batches[k] of the 1st call to dp_netdev_input__(). */ + for (i = 0; i < n_batches; i++) { + batches[i].flow->batch = NULL; + } + + for (i = 0; i < n_batches; i++) { + packet_batch_per_flow_execute(&batches[i], pmd); + } +} + +int32_t +dp_netdev_input(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets, + odp_port_t port_no) +{ + dp_netdev_input__(pmd, packets, false, port_no); + return 0; +} + +static void +dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + size_t i, size = dp_packet_batch_size(packets); + struct dp_packet *packet; + + DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, packets) { + if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + + if (dp_packet_hwol_is_tso(packet)) { + /* Can't perform GSO in the middle of a pipeline. */ + COVERAGE_INC(datapath_drop_tunnel_tso_recirc); + dp_packet_delete(packet); + VLOG_WARN_RL(&rl, "Recirculating tunnel packets with " + "TSO is not supported"); + continue; + } + /* Have to fix all the checksums before re-parsing, because the + * packet will be treated as having a single set of headers. */ + dp_packet_ol_send_prepare(packet, 0); + /* This packet must not be marked with anything tunnel-related. */ + dp_packet_hwol_reset_tunnel(packet); + /* Clear inner offsets. Other ones are collateral, but they will + * be re-initialized on re-parsing. 
*/ + dp_packet_reset_offsets(packet); + } + dp_packet_batch_refill(packets, packet, i); + } + + dp_netdev_input__(pmd, packets, true, 0); +} + +struct dp_netdev_execute_aux { + struct dp_netdev_pmd_thread *pmd; + const struct flow *flow; + struct dp_netdev_flow *dp_flow; + const struct nlattr *actions; + size_t actions_len; +}; + +static void +dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, + void *aux) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + dp->dp_purge_aux = aux; + dp->dp_purge_cb = cb; +} + +static void +dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb, + void *aux) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + dp->upcall_aux = aux; + dp->upcall_cb = cb; +} + +static void +dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, + bool purge) +{ + struct tx_port *tx; + struct dp_netdev_port *port; + long long interval; + + HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) { + if (tx->port->txq_mode != TXQ_MODE_XPS) { + continue; + } + interval = pmd->ctx.now - tx->last_used; + if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) { + port = tx->port; + ovs_mutex_lock(&port->txq_used_mutex); + port->txq_used[tx->qid]--; + ovs_mutex_unlock(&port->txq_used_mutex); + tx->qid = -1; + } + } +} + +static int +dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd, + struct tx_port *tx) +{ + struct dp_netdev_port *port; + long long interval; + int i, min_cnt, min_qid; + + interval = pmd->ctx.now - tx->last_used; + tx->last_used = pmd->ctx.now; + + if (netdev_n_txq(tx->port->netdev) == 1) { + return 0; + } + + if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) { + return tx->qid; + } + + port = tx->port; + + ovs_mutex_lock(&port->txq_used_mutex); + if (tx->qid >= 0) { + port->txq_used[tx->qid]--; + tx->qid = -1; + } + + min_cnt = -1; + min_qid = 0; + for (i = 0; i < netdev_n_txq(port->netdev); i++) { + if (port->txq_used[i] < min_cnt || min_cnt == -1) { + min_cnt = 
port->txq_used[i]; + min_qid = i; + } + } + + port->txq_used[min_qid]++; + tx->qid = min_qid; + + ovs_mutex_unlock(&port->txq_used_mutex); + + dpif_netdev_xps_revalidate_pmd(pmd, false); + + VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.", + pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev)); + return min_qid; +} + +static struct tx_port * +pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, + odp_port_t port_no) +{ + return tx_port_lookup(&pmd->tnl_port_cache, port_no); +} + +static struct tx_port * +pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd, + odp_port_t port_no) +{ + return tx_port_lookup(&pmd->send_port_cache, port_no); +} + +static int +push_tnl_action(const struct dp_netdev_pmd_thread *pmd, + const struct nlattr *attr, + struct dp_packet_batch *batch) +{ + struct tx_port *tun_port; + const struct ovs_action_push_tnl *data; + int err; + + data = nl_attr_get(attr); + + tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port); + if (!tun_port) { + err = -EINVAL; + goto error; + } + err = netdev_push_header(tun_port->port->netdev, batch, data); + if (!err) { + return 0; + } +error: + dp_packet_delete_batch(batch, true); + return err; +} + +static void +dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd, + struct dp_packet *packet, bool should_steal, + struct flow *flow, struct dp_netdev_flow *dp_flow, + ovs_u128 *ufid, struct ofpbuf *actions, + const struct nlattr *userdata) +{ + struct dp_packet_batch b; + int error; + + ofpbuf_clear(actions); + + error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid, + DPIF_UC_ACTION, userdata, actions, + NULL); + if (!error || error == ENOSPC) { + dp_packet_batch_init_packet(&b, packet); + dp_netdev_execute_actions(pmd, &b, should_steal, flow, dp_flow, + actions->data, actions->size); + } else if (should_steal) { + dp_packet_delete(packet); + COVERAGE_INC(datapath_drop_userspace_action_error); + } +} + +static bool +dp_execute_output_action(struct 
dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets_, + bool should_steal, odp_port_t port_no) +{ + struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no); + struct dp_packet_batch out; + + if (dp_netdev_e2e_cache_enabled) { + e2e_cache_dispatch_trace_message(pmd->dp, packets_, pmd->ctx.now); + } + + if (!OVS_LIKELY(p)) { + COVERAGE_ADD(datapath_drop_invalid_port, + dp_packet_batch_size(packets_)); + dp_packet_delete_batch(packets_, should_steal); + return false; + } + if (!should_steal) { + dp_packet_batch_clone(&out, packets_); + dp_packet_batch_reset_cutlen(packets_); + packets_ = &out; + } + dp_packet_batch_apply_cutlen(packets_); +#ifdef DPDK_NETDEV + if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts) + && packets_->packets[0]->source + != p->output_pkts.packets[0]->source)) { + /* XXX: netdev-dpdk assumes that all packets in a single + * output batch has the same source. Flush here to + * avoid memory access issues. */ + dp_netdev_pmd_flush_output_on_port(pmd, p); + } +#endif + if (dp_packet_batch_size(&p->output_pkts) + + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) { + /* Flush here to avoid overflow. 
*/ + dp_netdev_pmd_flush_output_on_port(pmd, p); + } + if (dp_packet_batch_is_empty(&p->output_pkts)) { + pmd->n_output_batches++; + } + + struct dp_packet *packet; + bool has_nonsampled_pkts = false; + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + if (packet->md.sample) { + if (!should_steal) { + dp_packet_delete(packet); + } + continue; + } + p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] = + pmd->ctx.last_rxq; + dp_packet_batch_add(&p->output_pkts, packet); + has_nonsampled_pkts = true; + } + return has_nonsampled_pkts; +} + +static void +dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets_, + bool should_steal, uint32_t bond) +{ + struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond); + struct dp_packet_batch out; + struct dp_packet *packet; + + if (!p_bond) { + COVERAGE_ADD(datapath_drop_invalid_bond, + dp_packet_batch_size(packets_)); + dp_packet_delete_batch(packets_, should_steal); + return; + } + if (!should_steal) { + dp_packet_batch_clone(&out, packets_); + dp_packet_batch_reset_cutlen(packets_); + packets_ = &out; + } + dp_packet_batch_apply_cutlen(packets_); + + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + /* + * Lookup the bond-hash table using hash to get the member. + */ + uint32_t hash = dp_packet_get_rss_hash(packet); + struct member_entry *s_entry + = &p_bond->member_buckets[hash & BOND_MASK]; + odp_port_t bond_member = s_entry->member_id; + uint32_t size = dp_packet_size(packet); + struct dp_packet_batch output_pkt; + + dp_packet_batch_init_packet(&output_pkt, packet); + if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true, + bond_member))) { + /* Update member stats. 
*/ + non_atomic_ullong_add(&s_entry->n_packets, 1); + non_atomic_ullong_add(&s_entry->n_bytes, size); + } + } +} + +static void +dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, + const struct nlattr *a, bool should_steal) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + struct dp_netdev_execute_aux *aux = aux_; + uint32_t *depth = recirc_depth_get(); + struct dp_netdev_pmd_thread *pmd = aux->pmd; + struct dp_netdev *dp = pmd->dp; + int type = nl_attr_type(a); + struct tx_port *p; + uint32_t packet_count, packets_dropped; + + switch ((enum ovs_action_attr)type) { + case OVS_ACTION_ATTR_OUTPUT: + dp_execute_output_action(pmd, packets_, should_steal, + nl_attr_get_odp_port(a)); + return; + + case OVS_ACTION_ATTR_LB_OUTPUT: + dp_execute_lb_output_action(pmd, packets_, should_steal, + nl_attr_get_u32(a)); + return; + + case OVS_ACTION_ATTR_TUNNEL_PUSH: + if (should_steal) { + /* We're requested to push tunnel header, but also we need to take + * the ownership of these packets. Thus, we can avoid performing + * the action, because the caller will not use the result anyway. + * Just break to free the batch. 
*/ + break; + } + dp_packet_batch_apply_cutlen(packets_); + packet_count = dp_packet_batch_size(packets_); + if (push_tnl_action(pmd, a, packets_)) { + COVERAGE_ADD(datapath_drop_tunnel_push_error, + packet_count); + } + return; + + case OVS_ACTION_ATTR_TUNNEL_POP: + if (*depth < max_recirc_depth) { + struct dp_packet_batch *orig_packets_ = packets_; + odp_port_t portno = nl_attr_get_odp_port(a); + + p = pmd_tnl_port_cache_lookup(pmd, portno); + if (p) { + struct dp_packet_batch tnl_pkt; + + if (!should_steal) { + dp_packet_batch_clone(&tnl_pkt, packets_); + packets_ = &tnl_pkt; + dp_packet_batch_reset_cutlen(orig_packets_); + } + + dp_packet_batch_apply_cutlen(packets_); + + packet_count = dp_packet_batch_size(packets_); + netdev_pop_header(p->port->netdev, packets_); + packets_dropped = + packet_count - dp_packet_batch_size(packets_); + if (packets_dropped) { + COVERAGE_ADD(datapath_drop_tunnel_pop_error, + packets_dropped); + } + if (dp_packet_batch_is_empty(packets_)) { + return; + } + + struct dp_packet *packet; + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + if (dp_netdev_e2e_cache_enabled) { + e2e_cache_trace_tnl_pop(packet); + } + packet->md.in_port.odp_port = portno; + } + + (*depth)++; + dp_netdev_recirculate(pmd, packets_); + (*depth)--; + return; + } + COVERAGE_ADD(datapath_drop_invalid_tnl_port, + dp_packet_batch_size(packets_)); + } else { + COVERAGE_ADD(datapath_drop_recirc_error, + dp_packet_batch_size(packets_)); + } + break; + + case OVS_ACTION_ATTR_TUN_DECAP: { + odp_port_t portno = nl_attr_get_odp_port(a); + struct dp_packet *packet; + struct netdev *netdev; + size_t i; + + if (should_steal) { + /* We are requested to decap tunnel header and take ownership of + * these packets, i.e. the caller will not use the result of this + * action. + * This is an explicit tunnel action, processing continues only + * through the caller and packets are not handed over to a tunnel + * interface. 
+ * For this reason, if the caller relinquishes ownership of the + * packets, there is nothing more to do: packets should be + * implicitly dropped. + * Break out of this switch to free the packets batch. + */ + break; + } + packet_count = dp_packet_batch_size(packets_); + DP_PACKET_BATCH_REFILL_FOR_EACH (i, packet_count, packet, packets_) { + p = pmd_tnl_port_cache_lookup(pmd, portno); + if (!p) { + dp_packet_delete(packet); + continue; + } + netdev = p->port->netdev; + if (!netdev_has_tunnel_push_pop(netdev)) { + dp_packet_delete(packet); + continue; + } + parse_tcp_flags(packet, NULL, NULL, NULL); + packet = netdev->netdev_class->pop_header(netdev, packet, false); + if (packet) { + odp_port_t orig_in_port; + + /* Drop the tunnel metadata, and restore orig_in_port */ + orig_in_port = packet->md.orig_in_port; + pkt_metadata_init_tnl(&packet->md); + packet->md.in_port.odp_port = orig_in_port; + parse_tcp_flags(packet, NULL, NULL, NULL); + dp_packet_batch_refill(packets_, packet, i); + } + } + COVERAGE_ADD(datapath_drop_invalid_tnl_port, + packet_count - dp_packet_batch_size(packets_)); + return; + } + + case OVS_ACTION_ATTR_USERSPACE: + if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) { + struct dp_packet_batch *orig_packets_ = packets_; + const struct nlattr *userdata; + struct dp_packet_batch usr_pkt; + struct ofpbuf actions; + struct flow flow; + ovs_u128 ufid; + bool clone = false; + + userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA); + ofpbuf_init(&actions, 0); + + if (packets_->trunc) { + if (!should_steal) { + dp_packet_batch_clone(&usr_pkt, packets_); + packets_ = &usr_pkt; + clone = true; + dp_packet_batch_reset_cutlen(orig_packets_); + } + + dp_packet_batch_apply_cutlen(packets_); + } + + struct dp_packet *packet; + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + flow_extract(packet, &flow); + odp_flow_key_hash(&flow, sizeof flow, &ufid); + dp_execute_userspace_action(pmd, packet, should_steal, &flow, + aux->dp_flow, &ufid, &actions, + 
userdata); + } + + if (clone) { + dp_packet_delete_batch(packets_, true); + } + + ofpbuf_uninit(&actions); + fat_rwlock_unlock(&dp->upcall_rwlock); + + return; + } + COVERAGE_ADD(datapath_drop_lock_error, + dp_packet_batch_size(packets_)); + break; + + case OVS_ACTION_ATTR_RECIRC: + if (*depth < max_recirc_depth) { + struct dp_packet_batch recirc_pkts; + + if (!should_steal) { + dp_packet_batch_clone(&recirc_pkts, packets_); + packets_ = &recirc_pkts; + } + + struct dp_packet *packet; + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + packet->md.recirc_id = nl_attr_get_u32(a); + } + + (*depth)++; + dp_netdev_recirculate(pmd, packets_); + (*depth)--; + + return; + } + + COVERAGE_ADD(datapath_drop_recirc_error, + dp_packet_batch_size(packets_)); + VLOG_WARN("Packet dropped. Max recirculation depth exceeded."); + break; + + case OVS_ACTION_ATTR_CT: { + const struct nlattr *b; + bool force = false; + bool commit = false; + unsigned int left; + uint16_t zone = 0; + uint32_t tp_id = 0; + const char *helper = NULL; + const uint32_t *setmark = NULL; + const struct ovs_key_ct_labels *setlabel = NULL; + struct nat_action_info_t nat_action_info; + struct nat_action_info_t *nat_action_info_ref = NULL; + bool nat_config = false; + + NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a), + nl_attr_get_size(a)) { + enum ovs_ct_attr sub_type = nl_attr_type(b); + + switch (sub_type) { + case OVS_CT_ATTR_FORCE_COMMIT: + force = true; + /* fall through. */ + case OVS_CT_ATTR_COMMIT: + commit = true; + break; + case OVS_CT_ATTR_ZONE: + zone = nl_attr_get_u16(b); + break; + case OVS_CT_ATTR_HELPER: + helper = nl_attr_get_string(b); + break; + case OVS_CT_ATTR_MARK: + setmark = nl_attr_get(b); + break; + case OVS_CT_ATTR_LABELS: + setlabel = nl_attr_get(b); + break; + case OVS_CT_ATTR_EVENTMASK: + /* Silently ignored, as userspace datapath does not generate + * netlink events. 
*/ + break; + case OVS_CT_ATTR_TIMEOUT: + if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) { + VLOG_WARN("Invalid Timeout Policy ID: %s.", + nl_attr_get_string(b)); + tp_id = DEFAULT_TP_ID; + } + break; + case OVS_CT_ATTR_NAT: { + const struct nlattr *b_nest; + unsigned int left_nest; + bool ip_min_specified = false; + bool proto_num_min_specified = false; + bool ip_max_specified = false; + bool proto_num_max_specified = false; + memset(&nat_action_info, 0, sizeof nat_action_info); + nat_action_info_ref = &nat_action_info; + + NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) { + enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest); + + switch (sub_type_nest) { + case OVS_NAT_ATTR_SRC: + case OVS_NAT_ATTR_DST: + nat_config = true; + nat_action_info.nat_action |= + ((sub_type_nest == OVS_NAT_ATTR_SRC) + ? NAT_ACTION_SRC : NAT_ACTION_DST); + break; + case OVS_NAT_ATTR_IP_MIN: + memcpy(&nat_action_info.min_addr, + nl_attr_get(b_nest), + nl_attr_get_size(b_nest)); + ip_min_specified = true; + break; + case OVS_NAT_ATTR_IP_MAX: + memcpy(&nat_action_info.max_addr, + nl_attr_get(b_nest), + nl_attr_get_size(b_nest)); + ip_max_specified = true; + break; + case OVS_NAT_ATTR_PROTO_MIN: + nat_action_info.min_port = + nl_attr_get_u16(b_nest); + proto_num_min_specified = true; + break; + case OVS_NAT_ATTR_PROTO_MAX: + nat_action_info.max_port = + nl_attr_get_u16(b_nest); + proto_num_max_specified = true; + break; + case OVS_NAT_ATTR_PROTO_RANDOM: + nat_action_info.nat_flags |= NAT_RANGE_RANDOM; + break; + case OVS_NAT_ATTR_PERSISTENT: + nat_action_info.nat_flags |= NAT_PERSISTENT; + break; + case OVS_NAT_ATTR_PROTO_HASH: + break; + case OVS_NAT_ATTR_UNSPEC: + case __OVS_NAT_ATTR_MAX: + OVS_NOT_REACHED(); + } + } + + if (ip_min_specified && !ip_max_specified) { + nat_action_info.max_addr = nat_action_info.min_addr; + } + if (proto_num_min_specified && !proto_num_max_specified) { + nat_action_info.max_port = nat_action_info.min_port; + } + if (proto_num_min_specified || 
proto_num_max_specified) { + if (nat_action_info.nat_action & NAT_ACTION_SRC) { + nat_action_info.nat_action |= NAT_ACTION_SRC_PORT; + } else if (nat_action_info.nat_action & NAT_ACTION_DST) { + nat_action_info.nat_action |= NAT_ACTION_DST_PORT; + } + } + break; + } + case OVS_CT_ATTR_UNSPEC: + case __OVS_CT_ATTR_MAX: + OVS_NOT_REACHED(); + } + } + + /* We won't be able to function properly in this case, hence + * complain loudly. */ + if (nat_config && !commit) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + VLOG_WARN_RL(&rl, "NAT specified without commit."); + } + + conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force, + commit, zone, setmark, setlabel, helper, + nat_action_info_ref, pmd->ctx.now / 1000, tp_id); + break; + } + + case OVS_ACTION_ATTR_METER: + dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a), + pmd->ctx.now / 1000); + break; + + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *hash_act = nl_attr_get(a); + struct dp_packet *packet; + + /* Calculate a hash value directly. This might not match the + * value computed by the datapath, but it is much less expensive, + * and the current use case (bonding) does not require a strict + * match to work properly. */ + switch (hash_act->hash_alg) { + case OVS_HASH_ALG_L4: { + struct flow flow; + uint32_t hash; + + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + /* RSS hash can be used here instead of 5tuple for + * performance reasons. */ + if (dp_packet_rss_valid(packet)) { + hash = dp_packet_get_rss_hash(packet); + hash = hash_int(hash, hash_act->hash_basis); + } else { + flow_extract(packet, &flow); + hash = flow_hash_5tuple(&flow, hash_act->hash_basis); + } + packet->md.dp_hash = hash; + } + break; + } + case OVS_HASH_ALG_DOCA: + /* Fallthrough. 
*/ + case OVS_HASH_ALG_SYM_L4: { + struct flow flow; + uint32_t hash; + + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + if (packet->orig_netdev) { + if (0 == netdev_packet_hw_hash(packet->orig_netdev, packet, + hash_act->hash_basis, + &packet->md.dp_hash)) { + continue; + } + } + flow_extract(packet, &flow); + hash = flow_hash_symmetric_l3l4(&flow, + hash_act->hash_basis, + false); + packet->md.dp_hash = hash; + } + break; + } + default: + /* Assert on unknown hash algorithm. */ + OVS_NOT_REACHED(); + } + break; + } + + case OVS_ACTION_ATTR_PUSH_VLAN: + case OVS_ACTION_ATTR_POP_VLAN: + case OVS_ACTION_ATTR_PUSH_MPLS: + case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_SET: + case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_SAMPLE: + case OVS_ACTION_ATTR_UNSPEC: + case OVS_ACTION_ATTR_TRUNC: + case OVS_ACTION_ATTR_PUSH_ETH: + case OVS_ACTION_ATTR_POP_ETH: + case OVS_ACTION_ATTR_CLONE: + case OVS_ACTION_ATTR_PUSH_NSH: + case OVS_ACTION_ATTR_POP_NSH: + case OVS_ACTION_ATTR_CT_CLEAR: + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + case OVS_ACTION_ATTR_DROP: + case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: + case __OVS_ACTION_ATTR_MAX: + OVS_NOT_REACHED(); + } + + dp_packet_delete_batch(packets_, should_steal); +} + +static void +dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets, + bool should_steal, const struct flow *flow, + struct dp_netdev_flow *dp_flow, + const struct nlattr *actions, size_t actions_len) +{ + struct dp_netdev_execute_aux aux = { + .pmd = pmd, + .flow = flow, + .dp_flow = dp_flow, + .actions = actions, + .actions_len = actions_len, + }; + + odp_execute_actions(&aux, packets, should_steal, actions, + actions_len, dp_execute_cb); +} + +struct dp_netdev_ct_dump { + struct ct_dpif_dump_state up; + struct conntrack_dump dump; + struct conntrack *ct; + struct dp_netdev *dp; +}; + +static int +dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_, + const uint16_t 
*pzone, int *ptot_bkts) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_ct_dump *dump; + + dump = xzalloc(sizeof *dump); + dump->dp = dp; + dump->ct = dp->conntrack; + + conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts); + + *dump_ = &dump->up; + + return 0; +} + +/* Fetches the next entry of the dump behind 'dump_' into 'entry' and returns + * the result of conntrack_dump_next(). */ +static int +dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED, + struct ct_dpif_dump_state *dump_, + struct ct_dpif_entry *entry) +{ + struct dp_netdev_ct_dump *dump; + + INIT_CONTAINER(dump, dump_, up); + + return conntrack_dump_next(&dump->dump, entry); +} + +/* Finishes the dump behind 'dump_' and frees its state. Returns the result + * of conntrack_dump_done(). */ +static int +dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED, + struct ct_dpif_dump_state *dump_) +{ + struct dp_netdev_ct_dump *dump; + int err; + + INIT_CONTAINER(dump, dump_, up); + + err = conntrack_dump_done(&dump->dump); + + free(dump); + + return err; +} + +/* Same as dpif_netdev_ct_dump_start(), but starts the dump with + * conntrack_exp_dump_start(). Always returns 0. */ +static int +dpif_netdev_ct_exp_dump_start(struct dpif *dpif, + struct ct_dpif_dump_state **dump_, + const uint16_t *pzone) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_ct_dump *dump; + + dump = xzalloc(sizeof *dump); + dump->dp = dp; + dump->ct = dp->conntrack; + + conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone); + + *dump_ = &dump->up; + + return 0; +} + +/* Fetches the next entry of the dump behind 'dump_' into 'entry' and returns + * the result of conntrack_exp_dump_next(). 
*/ +static int +dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED, + struct ct_dpif_dump_state *dump_, + struct ct_dpif_exp *entry) +{ + struct dp_netdev_ct_dump *dump; + + INIT_CONTAINER(dump, dump_, up); + + return conntrack_exp_dump_next(&dump->dump, entry); +} + +/* Finishes the dump behind 'dump_' and frees its state. Returns the result + * of conntrack_exp_dump_done(). */ +static int +dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED, + struct ct_dpif_dump_state *dump_) +{ + struct dp_netdev_ct_dump *dump; + int err; + + INIT_CONTAINER(dump, dump_, up); + + err = conntrack_exp_dump_done(&dump->dump); + + free(dump); + + return err; +} + +/* Flushes conntrack state of the datapath behind 'dpif': only 'tuple' when + * one is given (in '*zone', or in zone 0 when 'zone' is NULL), otherwise + * whatever conntrack_flush() selects for 'zone'. */ +static int +dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, + const struct ct_dpif_tuple *tuple) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + + if (tuple) { + return 
conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0); + } + return conntrack_flush(dp->conntrack, zone); +} + +/* The conntrack accessors below forward to the conntrack instance of the + * datapath behind 'dpif'. The setters and the maxconns/nconns getters + * return the result of the conntrack call; the tcp_seq_chk and + * sweep_interval getters store the value and always return 0. */ +static int +dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + + return conntrack_set_maxconns(dp->conntrack, maxconns); +} + +static int +dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + + return conntrack_get_maxconns(dp->conntrack, maxconns); +} + +static int +dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + + return conntrack_get_nconns(dp->conntrack, nconns); +} + +static int +dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + + return conntrack_set_tcp_seq_chk(dp->conntrack, enabled); +} + +static int +dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + *enabled = conntrack_get_tcp_seq_chk(dp->conntrack); + return 0; +} + +static int +dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + return conntrack_set_sweep_interval(dp->conntrack, ms); +} + +static int +dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + *ms = conntrack_get_sweep_interval(dp->conntrack); + return 0; +} + +/* Applies each entry of 'zone_limits' with zone_limit_update(), stopping at + * the first failure. Returns 0, or the error of the failing update; there + * is no rollback of the entries applied before it. 
*/ +static int +dpif_netdev_ct_set_limits(struct dpif *dpif, + const struct ovs_list *zone_limits) +{ + int err = 0; + struct dp_netdev *dp = get_dp_netdev2(dpif); + + struct ct_dpif_zone_limit *zone_limit; + LIST_FOR_EACH (zone_limit, node, zone_limits) { + err = zone_limit_update(dp->conntrack, zone_limit->zone, + zone_limit->limit); + if (err != 0) { + break; + } + } + return err; +} + +/* Appends zone limits to 'zone_limits_reply': those of the zones listed in + * 'zone_limits_request', or, when the request list is empty, the default + * zone followed by every zone in MIN_ZONE..MAX_ZONE for which + * zone_limit_get() returns an entry of that same zone. Returns EINVAL if a + * requested zone resolves to neither itself nor DEFAULT_ZONE, 0 otherwise. */ +static int +dpif_netdev_ct_get_limits(struct dpif *dpif, + const struct ovs_list *zone_limits_request, + struct ovs_list 
*zone_limits_reply) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct conntrack_zone_limit czl; + + if (!ovs_list_is_empty(zone_limits_request)) { + /* Specific zones requested: report each under its own zone number, + * with the limit and count that zone_limit_get() resolved for it. */ + struct ct_dpif_zone_limit *zone_limit; + LIST_FOR_EACH (zone_limit, node, zone_limits_request) { + czl = zone_limit_get(dp->conntrack, zone_limit->zone); + if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) { + ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone, + czl.limit, + atomic_count_get(&czl.count)); + } else { + return EINVAL; + } + } + } else { + /* Nothing requested: report the default zone first (with a count + * of 0), then walk the whole zone range. */ + czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE); + if (czl.zone == DEFAULT_ZONE) { + ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE, + czl.limit, 0); + } + + for (int z = MIN_ZONE; z <= MAX_ZONE; z++) { + czl = zone_limit_get(dp->conntrack, z); + if (czl.zone == z) { + ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit, + atomic_count_get(&czl.count)); + } + } + } + + return 0; +} + +/* Calls zone_limit_delete() for each zone in 'zone_limits', stopping at and + * returning the first error. Returns 0 if every call succeeded. */ +static int +dpif_netdev_ct_del_limits(struct dpif *dpif, + const struct ovs_list *zone_limits) +{ + int err = 0; + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct ct_dpif_zone_limit *zone_limit; + LIST_FOR_EACH (zone_limit, node, zone_limits) { + err = zone_limit_delete(dp->conntrack, zone_limit->zone); + if (err != 0) { + break; + } + } + + return err; +} + +/* Stores CONNTRACK_F_ZERO_SNAT in '*features' unless 'features' is NULL. + * Always returns 0. 
*/ +static int +dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED, + enum ct_features *features) +{ + if (features != NULL) { + *features = CONNTRACK_F_ZERO_SNAT; + } + return 0; +} + +/* Copies '*dpif_tp' into a local struct timeout_policy and passes it to + * timeout_policy_update(), whose result is returned. */ +static int +dpif_netdev_ct_set_timeout_policy(struct dpif *dpif, + const struct ct_dpif_timeout_policy *dpif_tp) +{ + struct timeout_policy tp; + struct dp_netdev *dp; + + dp = get_dp_netdev2(dpif); + /* Copies sizeof tp.policy bytes, so 'tp.policy' is presumably a + * struct ct_dpif_timeout_policy -- confirm. */ + memcpy(&tp.policy, dpif_tp, sizeof tp.policy); + return timeout_policy_update(dp->conntrack, &tp); +} + +/* Copies the timeout policy 'tp_id' into '*dpif_tp'. Returns ENOENT if + * timeout_policy_get() finds no such policy, 0 otherwise. */ +static int +dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id, + struct ct_dpif_timeout_policy *dpif_tp) +{ + struct timeout_policy *tp; + struct dp_netdev *dp; + int err = 0; + + 
dp = get_dp_netdev2(dpif); + tp = timeout_policy_get(dp->conntrack, tp_id); + if (!tp) { + return ENOENT; + } + memcpy(dpif_tp, &tp->policy, sizeof tp->policy); + return err; +} + +/* Deletes timeout policy 'tp_id' and returns the result of + * timeout_policy_delete(). */ +static int +dpif_netdev_ct_del_timeout_policy(struct dpif *dpif, + uint32_t tp_id) +{ + struct dp_netdev *dp; + int err = 0; + + dp = get_dp_netdev2(dpif); + err = timeout_policy_delete(dp->conntrack, tp_id); + return err; +} + +/* Sets '*tp_name' to the decimal representation of 'tp_id' and '*is_generic' + * to true. The string comes from ds_steal_cstr(), so it is presumably the + * caller's to free. Always returns 0. */ +static int +dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED, + uint32_t tp_id, + uint16_t dl_type OVS_UNUSED, + uint8_t nw_proto OVS_UNUSED, + char **tp_name, bool *is_generic) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + + ds_put_format(&ds, "%"PRIu32, tp_id); + *tp_name = ds_steal_cstr(&ds); + *is_generic = true; + return 0; +} + +/* The ipf setters below forward to the context that conntrack_ipf_ctx() + * returns for the conntrack instance of the datapath behind 'dpif', and + * return the result of the corresponding ipf_set_*() call. 
*/ +static int +dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable); +} + +static int +dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag); +} + +static int +dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags); +} + +/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to + * diverge. 
*/ +static int +dpif_netdev_ipf_get_status(struct dpif *dpif, + struct dpif_ipf_status *dpif_ipf_status) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + /* The cast hands the caller's dpif_ipf_status storage to ipf_get_status() + * as a struct ipf_status, so the two layouts have to match. */ + ipf_get_status(conntrack_ipf_ctx(dp->conntrack), + (struct ipf_status *) dpif_ipf_status); + return 0; +} + +/* Thin wrapper: returns the result of ipf_dump_start(). */ +static int +dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED, + struct ipf_dump_ctx **ipf_dump_ctx) +{ + return ipf_dump_start(ipf_dump_ctx); +} + +/* Thin wrapper: returns the result of ipf_dump_next() on the ipf context of + * the datapath behind 'dpif'. */ +static int +dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx, + dump); +} + +/* Thin wrapper: returns the result of ipf_dump_done(). */ +static int +dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx) +{ + return ipf_dump_done(ipf_dump_ctx); + +} + +/* Creates the tx bond 'bond_id' from 'member_map', replacing an existing bond + * with the same id, and then hands the new mapping to every PMD thread. + * 'member_map' is read for BOND_BUCKETS entries. Always returns 0. */ +static int +dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id, + odp_port_t *member_map) +{ + struct tx_bond *new_tx = xzalloc(sizeof *new_tx); + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_pmd_thread *pmd; + + /* Prepare new bond mapping. */ + new_tx->bond_id = bond_id; + for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) { + new_tx->member_buckets[bucket].member_id = member_map[bucket]; + } + + ovs_mutex_lock(&dp->bond_mutex); + /* Check if bond already existed. 
*/ + struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id); + if (old_tx) { + cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node, + hash_bond_id(bond_id)); + /* The replaced entry is not freed here; free() is deferred through + * ovsrcu_postpone(). */ + ovsrcu_postpone(free, old_tx); + } else { + cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id)); + } + ovs_mutex_unlock(&dp->bond_mutex); + + /* Update all PMDs with new bond mapping. 
*/ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true); + } + return 0; +} + +/* Removes the tx bond 'bond_id' from the datapath behind 'dpif' and from + * every PMD thread. Returns ENOENT if the datapath has no such bond, + * 0 otherwise. */ +static int +dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_pmd_thread *pmd; + struct tx_bond *tx; + + ovs_mutex_lock(&dp->bond_mutex); + /* Check if bond existed. */ + tx = tx_bond_lookup(&dp->tx_bonds, bond_id); + if (tx) { + cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id)); + ovsrcu_postpone(free, tx); + } else { + /* Bond is not present. */ + ovs_mutex_unlock(&dp->bond_mutex); + return ENOENT; + } + ovs_mutex_unlock(&dp->bond_mutex); + + /* Remove the bond map in all pmds. */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + dp_netdev_del_bond_tx_from_pmd(pmd, bond_id); + } + return 0; +} + +/* Adds the per-bucket byte counters of bond 'bond_id', summed over all PMD + * threads, to the BOND_BUCKETS entries of 'n_bytes'. The counters are + * accumulated with '+=', so the caller has to initialize 'n_bytes'. + * Returns ENOENT if the datapath has no such bond, 0 otherwise. */ +static int +dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id, + uint64_t *n_bytes) +{ + struct dp_netdev *dp = get_dp_netdev2(dpif); + struct dp_netdev_pmd_thread *pmd; + + /* This lookup is done without taking 'bond_mutex'. */ + if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) { + return ENOENT; + } + + /* Search the bond in all PMDs. */ + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + struct tx_bond *pmd_bond_entry + = tx_bond_lookup(&pmd->tx_bonds, bond_id); + + if (!pmd_bond_entry) { + continue; + } + + /* Read bond stats. 
*/ + for (int i = 0; i < BOND_BUCKETS; i++) { + uint64_t pmd_n_bytes; + + atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes, + &pmd_n_bytes); + n_bytes[i] += pmd_n_bytes; + } + } + return 0; +} + +/* dpif provider whose first member, presumably the type name, is "netdev2". + * The initializer is positional: every entry, including the NULL + * placeholders for unimplemented callbacks, has to stay in the member order + * of struct dpif_class. */ +const struct dpif_class dpif_netdev2_class = { + "netdev2", + true, /* cleanup_required */ + true, /* synced_dp_layers */ + dpif_netdev_init, + dpif_netdev_enumerate, + dpif_netdev_port_open_type, + dpif_netdev2_open, + dpif_netdev_close, + dpif_netdev_destroy, + dpif_netdev_run, + dpif_netdev_wait, + dpif_netdev_get_stats, + NULL, /* set_features */ + dpif_netdev_port_add, + dpif_netdev_port_del, + dpif_netdev_port_set_config, + dpif_netdev_port_query_by_number, + dpif_netdev_port_query_by_name, + NULL, /* port_get_pid */ + dpif_netdev_port_dump_start, + dpif_netdev_port_dump_next, + dpif_netdev_port_dump_done, + dpif_netdev_port_poll, + dpif_netdev_port_poll_wait, + dpif_netdev_flow_flush, + dpif_netdev_flow_dump_create, + dpif_netdev_flow_dump_destroy, + dpif_netdev_flow_dump_thread_create, + dpif_netdev_flow_dump_thread_destroy, + dpif_netdev_flow_dump_next, + dpif_netdev_dump_e2e_flows, + dpif_netdev_operate, + dpif_netdev2_offload_stats_get, + dpif_netdev2_offload_stats_clear, + NULL, /* recv_set */ + NULL, /* handlers_set */ + NULL, /* number_handlers_required */ + dpif_netdev_set_config, + dpif_netdev_queue_to_priority, + NULL, /* recv */ + NULL, /* recv_wait */ + NULL, /* recv_purge */ + dpif_netdev_register_dp_purge_cb, + dpif_netdev_register_upcall_cb, + dpif_netdev_enable_upcall, + dpif_netdev_disable_upcall, + dpif_netdev_register_sflow_upcall_cb, + dpif_netdev_get_datapath_version, + dpif_netdev_ct_dump_start, + dpif_netdev_ct_dump_next, + dpif_netdev_ct_dump_done, + 
dpif_netdev_ct_exp_dump_start, + dpif_netdev_ct_exp_dump_next, + dpif_netdev_ct_exp_dump_done, + dpif_netdev_ct_flush, + dpif_netdev_ct_set_maxconns, + dpif_netdev_ct_get_maxconns, + dpif_netdev_ct_get_nconns, + dpif_netdev_ct_set_tcp_seq_chk, + dpif_netdev_ct_get_tcp_seq_chk, + 
dpif_netdev_ct_set_sweep_interval, + dpif_netdev_ct_get_sweep_interval, + dpif_netdev_ct_set_limits, + dpif_netdev_ct_get_limits, + dpif_netdev_ct_del_limits, + dpif_netdev2_ct_get_stats, + dpif_netdev_ct_set_timeout_policy, + dpif_netdev_ct_get_timeout_policy, + dpif_netdev_ct_del_timeout_policy, + NULL, /* ct_timeout_policy_dump_start */ + NULL, /* ct_timeout_policy_dump_next */ + NULL, /* ct_timeout_policy_dump_done */ + dpif_netdev_ct_get_timeout_policy_name, + dpif_netdev_ct_get_features, + dpif_netdev_ipf_set_enabled, + dpif_netdev_ipf_set_min_frag, + dpif_netdev_ipf_set_max_nfrags, + dpif_netdev_ipf_get_status, + dpif_netdev_ipf_dump_start, + dpif_netdev_ipf_dump_next, + dpif_netdev_ipf_dump_done, + dpif_netdev_meter_get_features, + dpif_netdev_meter_set, + dpif_netdev_meter_get, + dpif_netdev_meter_del, + dpif_netdev_bond_add, + dpif_netdev_bond_del, + dpif_netdev_bond_stats_get, + NULL, /* cache_get_supported_levels */ + NULL, /* cache_get_name */ + NULL, /* cache_get_size */ + NULL, /* cache_set_size */ +}; + +/* unixctl handler registered as dpif-dummy/change-port-number: moves port + * argv[2] of the dummy datapath argv[1] to the port number in argv[3]. + * Replies with an error if the datapath is unknown or not a dummy, the port + * is unknown, or the new number is zero, ODPP_NONE or already in use. */ +static void +dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[], void *aux OVS_UNUSED) +{ + struct dp_netdev_port *port; + struct dp_netdev *dp; + odp_port_t port_no; + + ovs_mutex_lock(&dp_netdev_mutex); + dp = shash_find_data(&dp_netdevs, argv[1]); + if (!dp || !dpif_netdev_class_is_dummy(dp->class)) { + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply_error(conn, "unknown datapath or not a dummy"); + return; + } + /* Take a reference before dropping dp_netdev_mutex; it is released with + * dp_netdev_unref() on the way out. 
*/ + ovs_refcount_ref(&dp->ref_cnt); + ovs_mutex_unlock(&dp_netdev_mutex); + + ovs_rwlock_wrlock(&dp->port_rwlock); + if (get_port_by_name(dp, argv[2], &port)) { + unixctl_command_reply_error(conn, "unknown port"); + goto exit; + } + + /* NOTE(review): atoi() reports no errors. Non-numeric input yields 0, + * which is rejected below, but trailing garbage is accepted and + * out-of-range input is undefined behavior. */ + port_no = u32_to_odp(atoi(argv[3])); + if (!port_no || port_no == ODPP_NONE) { + unixctl_command_reply_error(conn, "bad port number"); + goto exit; + } + if (dp_netdev_lookup_port(dp, port_no)) { + unixctl_command_reply_error(conn, "port number already 
in use"); + goto exit; + } + + /* Remove port. */ + hmap_remove(&dp->ports, &port->node); + reconfigure_datapath(dp); + + /* Reinsert with new port number. */ + port->port_no = port_no; + hmap_insert(&dp->ports, &port->node, hash_port_no(port_no)); + reconfigure_datapath(dp); + + seq_change(dp->port_seq); + unixctl_command_reply(conn, NULL); + +exit: + ovs_rwlock_unlock(&dp->port_rwlock); + dp_netdev_unref(dp); +} + +/* Registers a heap-allocated copy of dpif_netdev2_class whose type name is a + * copy of 'type'. Neither allocation is freed in this function. */ +static void +dpif_dummy_register__(const char *type) +{ + struct dpif_class *class; + + class = xmalloc(sizeof *class); + *class = dpif_netdev2_class; + class->type = xstrdup(type); + dp_register_provider(class); +} + +/* Unregisters the provider for 'type' and, if that succeeded or failed with + * EAFNOSUPPORT, registers the dummy class under the same type name. */ +static void +dpif_dummy_override(const char *type) +{ + int error; + + /* + * Ignore EAFNOSUPPORT to allow --enable-dummy=system with + * a userland-only build. It's useful for testsuite. + */ + error = dp_unregister_provider(type); + if (error == 0 || error == EAFNOSUPPORT) { + dpif_dummy_register__(type); + } +} + +/* Installs the dummy datapath: overrides every enumerated type for + * DUMMY_OVERRIDE_ALL or only the system type for DUMMY_OVERRIDE_SYSTEM, then + * always registers the dummy type itself and the change-port-number unixctl + * command. 
*/ +void +dpif_dummy_register(enum dummy_level level) +{ + if (level == DUMMY_OVERRIDE_ALL) { + struct sset types; + const char *type; + + sset_init(&types); + dp_enumerate_types(&types); + SSET_FOR_EACH (type, &types) { + dpif_dummy_override(type); + } + sset_destroy(&types); + } else if (level == DUMMY_OVERRIDE_SYSTEM) { + dpif_dummy_override("system"); + } + + dpif_dummy_register__("dummy"); + + unixctl_command_register("dpif-dummy/change-port-number", + "dp port new-number", + 3, 3, dpif_dummy_change_port_number, NULL); +} + +/* Datapath Classifier. */ + +/* Deferred destructor for a subtable, scheduled by dpcls_destroy_subtable(): + * destroys the rule map and defers freeing 'mf_masks' and the subtable itself + * once more through ovsrcu_postpone(). */ +static void +dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable) +{ + cmap_destroy(&subtable->rules); + ovsrcu_postpone(free, subtable->mf_masks); + ovsrcu_postpone(free, subtable); +} + +/* Initializes 'cls' as a classifier that initially contains no classification + * rules. 
*/ +static void +dpcls_init(struct dpcls *cls) +{ + cmap_init(&cls->subtables_map); + pvector_init(&cls->subtables); +} + +/* Unlinks 'subtable' from the pvector and the map of 'cls', drops its usage + * count on the lookup implementation info and defers the destruction to + * dpcls_subtable_destroy_cb() through ovsrcu_postpone(). The pvector change + * is not published here; dpcls_remove() calls pvector_publish() afterwards. */ +static void +dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable) +{ + VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port); + pvector_remove(&cls->subtables, subtable); + cmap_remove(&cls->subtables_map, &subtable->cmap_node, + subtable->mask.hash); + dpcls_info_dec_usage(subtable->lookup_func_info); + ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable); +} + +/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the + * caller's responsibility. + * May only be called after all the readers have been terminated. */ +static void +dpcls_destroy(struct dpcls *cls) +{ + if (cls) { + struct dpcls_subtable *subtable; + + CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) { + /* Every subtable is asserted to be empty, i.e. the caller has + * to have removed all rules beforehand. */ + ovs_assert(cmap_count(&subtable->rules) == 0); + dpcls_destroy_subtable(cls, subtable); + } + cmap_destroy(&cls->subtables_map); + pvector_destroy(&cls->subtables); + } +} + +/* Allocates a subtable for 'mask', adds it to the map and the pvector of + * 'cls' with priority 0, publishes the pvector and returns the subtable. */ +static struct dpcls_subtable * +dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) +{ + struct dpcls_subtable *subtable; + + /* Need to add one. The allocation swaps the full-size 'mask.mf' for the + * 'mask->len' bytes of this particular mask, so 'mask' is presumably the + * last member of the subtable -- confirm. 
*/ + subtable = xmalloc(sizeof *subtable + - sizeof subtable->mask.mf + mask->len); + cmap_init(&subtable->rules); + subtable->hit_cnt = 0; + netdev_flow_key_clone(&subtable->mask, mask); + + /* The count of bits in the mask defines the space required for masks. + * Then call gen_masks() to create the appropriate masks, avoiding the cost + * of doing runtime calculations. */ + uint32_t unit0 = count_1bits(mask->mf.map.bits[0]); + uint32_t unit1 = count_1bits(mask->mf.map.bits[1]); + subtable->mf_bits_set_unit0 = unit0; + subtable->mf_bits_set_unit1 = unit1; + subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1)); + dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1); + + /* Get the preferred subtable search function for this (u0,u1) subtable. 
+ * The function is guaranteed to always return a valid implementation, and + * possibly an ISA optimized, and/or specialized implementation. Initialize + * the subtable search function atomically to avoid garbage data being read + * by the PMD thread. + */ + atomic_init(&subtable->lookup_func, + dpcls_subtable_get_best_impl(unit0, unit1, + &subtable->lookup_func_info)); + dpcls_info_inc_usage(subtable->lookup_func_info); + + cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash); + /* Add the new subtable at the end of the pvector (with no hits yet) */ + pvector_insert(&cls->subtables, subtable, 0); + VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d", + cmap_count(&cls->subtables_map), subtable, cls->in_port); + /* The insertion only becomes part of the published vector here. */ + pvector_publish(&cls->subtables); + + return subtable; +} + +/* Returns the subtable of 'cls' whose mask equals 'mask'. Despite the name + * this is a find-or-create operation: on a miss it returns a new subtable + * from dpcls_create_subtable() instead of NULL. */ +static inline struct dpcls_subtable * +dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask) +{ + struct dpcls_subtable *subtable; + + /* Only subtables stored under 'mask->hash' are visited; the full mask + * comparison filters out hash collisions. */ + CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash, + &cls->subtables_map) { + if (netdev_flow_key_equal(&subtable->mask, mask)) { + return subtable; + } + } + return dpcls_create_subtable(cls, mask); +} + +/* Checks for the best available implementation for each subtable lookup + * function, and assigns it as the lookup function pointer for each subtable. + * Returns the number of subtables that have changed lookup implementation. + * This function requires holding a flow_mutex when called. This is to make + * sure modifications done by this function are not overwritten. 
This could + * happen if dpcls_sort_subtable_vector() is called at the same time as this + * function. 
+ */ +static uint32_t +dpcls_subtable_lookup_reprobe(struct dpcls *cls) +{ + struct pvector *pvec = &cls->subtables; + uint32_t subtables_changed = 0; + struct dpcls_subtable *subtable = NULL; + + PVECTOR_FOR_EACH (subtable, pvec) { + uint32_t u0_bits = subtable->mf_bits_set_unit0; + uint32_t u1_bits = subtable->mf_bits_set_unit1; + /* Remember the current function and info to detect changes below. */ + void *old_func = subtable->lookup_func; + struct dpcls_subtable_lookup_info_t *old_info; + old_info = subtable->lookup_func_info; + /* Set the subtable lookup function atomically to avoid garbage data + * being read by the PMD thread. */ + atomic_store_relaxed(&subtable->lookup_func, + dpcls_subtable_get_best_impl(u0_bits, u1_bits, + &subtable->lookup_func_info)); + if (old_func != subtable->lookup_func) { + subtables_changed += 1; + } + + if (old_info != subtable->lookup_func_info) { + /* In theory, functions can be shared between implementations, so + * do an explicit check on the function info structures. */ + dpcls_info_dec_usage(old_info); + dpcls_info_inc_usage(subtable->lookup_func_info); + } + } + + return subtables_changed; +} + +/* Sets the priority of every subtable of 'cls' to the hit count it gathered + * since the previous call, resets those counts and publishes the vector. + * Called from dp_netdev_pmd_try_optimize() with 'flow_mutex' held. */ +static void +dpcls_sort_subtable_vector(struct dpcls *cls) +{ + struct pvector *pvec = &cls->subtables; + struct dpcls_subtable *subtable; + + PVECTOR_FOR_EACH (subtable, pvec) { + pvector_change_priority(pvec, subtable, subtable->hit_cnt); + subtable->hit_cnt = 0; + } + pvector_publish(pvec); +} + +/* Periodic housekeeping for 'pmd'. 
Once 'next_cycle_store' has passed, + * refreshes the load and cycle statistics of the PMD and of the 'poll_cnt' + * queues in 'poll_list'; once 'next_optimization' has passed, re-sorts the + * subtables of all classifiers of the PMD, provided 'flow_mutex' can be + * taken without blocking. */ +static inline void +dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, + struct polled_queue *poll_list, int poll_cnt) +{ + struct dpcls *cls; + uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0; + unsigned int pmd_load = 0; + + if (pmd->ctx.now > pmd->next_cycle_store) { + uint64_t curr_tsc; + uint8_t rebalance_load_trigger; + struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb; + unsigned int idx; + + /* Deltas are only computed if neither the idle nor the busy counter + * went backwards since the previous interval. */ + if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >= + pmd->prev_stats[PMD_CYCLES_ITER_IDLE] && + 
pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >= + pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) { + tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] - + pmd->prev_stats[PMD_CYCLES_ITER_IDLE]; + tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] - + pmd->prev_stats[PMD_CYCLES_ITER_BUSY]; + /* NOTE(review): unlike idle and busy, the sleep counter is + * subtracted without a check that it did not go backwards. */ + tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] - + pmd->prev_stats[PMD_CYCLES_SLEEP]; + + if (pmd_alb->is_enabled && !pmd->isolated) { + /* Load is the busy share, in percent, of all cycles of the + * interval. It stays 0 when no busy cycles were counted, + * which also keeps the divisor from being zero. */ + if (tot_proc) { + pmd_load = ((tot_proc * 100) / + (tot_idle + tot_proc + tot_sleep)); + } + + atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, + &rebalance_load_trigger); + /* Consecutive intervals at or above the threshold are counted; + * a single interval below it resets the count. */ + if (pmd_load >= rebalance_load_trigger) { + atomic_count_inc(&pmd->pmd_overloaded); + } else { + atomic_count_set(&pmd->pmd_overloaded, 0); + } + } + } + + /* The current counters become the baseline of the next interval. */ + pmd->prev_stats[PMD_CYCLES_ITER_IDLE] = + pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE]; + pmd->prev_stats[PMD_CYCLES_ITER_BUSY] = + pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY]; + pmd->prev_stats[PMD_CYCLES_SLEEP] = + pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP]; + + /* Get the cycles that were used to process each queue and store. + * NOTE(review): 'i' is unsigned while 'poll_cnt' is an int, so a + * negative 'poll_cnt' would be compared as a very large value. 
*/ + for (unsigned i = 0; i < poll_cnt; i++) { + uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq, + RXQ_CYCLES_PROC_CURR); + dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr); + dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, + 0); + } + curr_tsc = cycles_counter_update(&pmd->perf_stats); + if (pmd->intrvl_tsc_prev) { + /* There is a prev timestamp, store a new intrvl cycle count. 
*/ + atomic_store_relaxed(&pmd->intrvl_cycles, + curr_tsc - pmd->intrvl_tsc_prev); + } + /* Store the busy cycles of this interval in a ring of + * PMD_INTERVAL_MAX slots. */ + idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX; + atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc); + pmd->intrvl_tsc_prev = curr_tsc; + /* Start new measuring interval */ + pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN; + } + + if (pmd->ctx.now > pmd->next_optimization) { + /* Try to obtain the flow lock to block out revalidator threads. + * If not possible, just try next time. A zero return from + * ovs_mutex_trylock() is treated as having taken the lock. */ + if (!ovs_mutex_trylock(&pmd->flow_mutex)) { + /* Optimize each classifier */ + CMAP_FOR_EACH (cls, node, &pmd->classifiers) { + dpcls_sort_subtable_vector(cls); + } + ovs_mutex_unlock(&pmd->flow_mutex); + /* Start new measuring interval */ + pmd->next_optimization = pmd->ctx.now + + DPCLS_OPTIMIZATION_INTERVAL; + } + } +} + +/* Returns the sum of the 'num_to_read' most recent values of the ring + * 'source', walking backwards from the slot before 'cur_idx' and wrapping + * around at PMD_INTERVAL_MAX. 'cur_idx' is where the next write will be. + * 'source' is indexed up to PMD_INTERVAL_MAX - 1; a 'num_to_read' larger than + * PMD_INTERVAL_MAX wraps onto slots already read and counts them again. + */ +static uint64_t +get_interval_values(atomic_ullong *source, atomic_count *cur_idx, + int num_to_read) { + unsigned int i; + uint64_t total = 0; + + i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX; + for (int read = 0; read < num_to_read; read++) { + uint64_t interval_value; + + /* Step back one slot, wrapping from 0 to the last slot. */ + i = i ? 
i - 1 : PMD_INTERVAL_MAX - 1; + atomic_read_relaxed(&source[i], &interval_value); + total += interval_value; + } + return total; +} + +/* Inserts 'rule' into the subtable of 'cls' that matches 'mask'; + * dpcls_find_subtable() creates that subtable if it does not exist yet. */ +static void +dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule, + const struct netdev_flow_key *mask) +{ + struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask); + + /* Refer to subtable's mask, also for later removal. */ + rule->mask = &subtable->mask; + cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash); +} + +/* Removes 'rule' from 'cls', also destructing the 'rule'. 
*/ +static void +dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule) +{ + struct dpcls_subtable *subtable; + + ovs_assert(rule->mask); + + /* Get subtable from reference in rule->mask. */ + INIT_CONTAINER(subtable, rule->mask, mask); + /* A zero result from cmap_remove() is treated as the subtable having + * become empty. */ + if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash) + == 0) { + /* Delete empty subtable. */ + dpcls_destroy_subtable(cls, subtable); + pvector_publish(&cls->subtables); + } +} + +/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. + * Writes 'count' entries to 'mf_masks', one per set bit of 'iter' taken from + * the lowest bit upwards, and asserts that no set bit is left afterwards. */ +static inline void +dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count, + uint64_t *mf_masks) +{ + int i; + for (i = 0; i < count; i++) { + /* 'iter & -iter' isolates the lowest set bit; subtracting one from + * it gives a mask of all the bits below that bit. */ + uint64_t lowest_bit = (iter & -iter); + iter &= ~lowest_bit; + mf_masks[i] = (lowest_bit - 1); + } + /* Checks that count has covered all bits in the iter bitmap. */ + ovs_assert(iter == 0); +} + +/* Generate a mask for each block in the miniflow, based on the bits set. This + * allows easily masking packets with the generated array here, without + * calculations. This replaces runtime-calculating the masks. 
+ * @param tbl The table to generate the mf_masks for + * @param mf_masks Pointer to a u64 array of at least + * (mf_bits_u0 + mf_bits_u1) elements; the unit1 masks are written + * after the unit0 masks + * @param mf_bits_u0 Number of bits set in unit0 of the miniflow map + * @param mf_bits_u1 Number of bits set in unit1 of the miniflow map + */ +void +dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl, + uint64_t *mf_masks, + const uint32_t mf_bits_u0, + const uint32_t mf_bits_u1) +{ + uint64_t iter_u0 = tbl->mf.map.bits[0]; + uint64_t iter_u1 = tbl->mf.map.bits[1]; + + dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]); + dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]); +} + +/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit + * in 'mask' the values in 'key' and 'target' are the same. 
*/ +inline bool +dpcls_rule_matches_key(const struct dpcls_rule *rule, + const struct netdev_flow_key *target) +{ + const uint64_t *keyp = miniflow_get_values(&rule->flow.mf); + const uint64_t *maskp = miniflow_get_values(&rule->mask->mf); + uint64_t value; + + /* 'keyp' and 'maskp' advance in lockstep with the values of 'target' + * that the iteration yields for the rule's map. */ + NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) { + if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) { + return false; + } + } + return true; +} + +/* For each miniflow in 'keys' performs a classifier lookup writing the result + * into the corresponding slot in 'rules'. If a particular entry in 'keys' is + * NULL it is skipped. + * + * This function is optimized for use in the userspace datapath and therefore + * does not implement a lot of features available in the standard + * classifier_lookup() function. Specifically, it does not implement + * priorities, instead returning any rule which matches the flow. + * + * 'cnt' is the number of entries in 'keys' and 'rules'. If 'num_lookups_p' + * is nonnull, it receives the sum, over all matched keys, of the 1-based + * position of the subtable that matched. + * + * Returns true if all miniflows found a corresponding rule. */ +bool +dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[], + struct dpcls_rule **rules, const size_t cnt, + int *num_lookups_p) +{ + /* The received 'cnt' miniflows are the search-keys that will be processed + * to find a matching entry into the available subtables. + * They are tracked in the 32-bit 'keys_map'; the assertion below + * guarantees that it has at least NETDEV_MAX_BURST bits. */ +#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT) + BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST); + + struct dpcls_subtable *subtable; + uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. 
*/ + + /* NOTE(review): 'cnt' == 0 makes the shift count equal to MAP_BITS, which + * is undefined for a 32-bit operand, and 'cnt' > MAP_BITS wraps the + * unsigned subtraction. Callers presumably pass 1..NETDEV_MAX_BURST -- + * confirm. */ + if (cnt != MAP_BITS) { + keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */ + } + memset(rules, 0, cnt * sizeof *rules); + + int lookups_match = 0, subtable_pos = 1; + uint32_t found_map; + + /* The Datapath classifier - aka dpcls - is composed of subtables. + * Subtables are dynamically created as needed when new rules are inserted. + * Each subtable collects rules with matches on a specific subset of packet + * fields as defined by the subtable's mask. 
We proceed to process every + * search-key against each subtable, but when a match is found for a + * search-key, the search for that key can stop because the rules are + * non-overlapping. */ + PVECTOR_FOR_EACH (subtable, &cls->subtables) { + /* Call the subtable specific lookup function. The bits set in its + * result are treated as the keys that matched in this subtable. */ + found_map = subtable->lookup_func(subtable, keys_map, keys, rules); + + /* Count the number of subtables searched for this packet match. This + * estimates the "spread" of subtables looked at per matched packet. */ + uint32_t pkts_matched = count_1bits(found_map); + lookups_match += pkts_matched * subtable_pos; + + /* Clear the found rules, and return early if all packets are found. */ + keys_map &= ~found_map; + if (!keys_map) { + if (num_lookups_p) { + *num_lookups_p = lookups_match; + } + return true; + } + subtable_pos++; + } + + /* All subtables were searched and some keys are still unmatched. */ + if (num_lookups_p) { + *num_lookups_p = lookups_match; + } + return false; +} + +/* Plugin descriptor whose 'plugin_class' points at dpif_netdev2_class. */ +struct dpif_plugin dpif_netdev2_plugin = { + .plugin_class = &dpif_netdev2_class, +}; diff --git a/lib/dpif-netdev2.h b/lib/dpif-netdev2.h new file mode 100644 index 00000000000..6e42502558b --- /dev/null +++ b/lib/dpif-netdev2.h @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc. + * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* NOTE(review): the include guard is DPIF_NETDEV_H although this file is + * dpif-netdev2.h. Any other header guarded by the same macro would be + * mutually exclusive with this one -- confirm that this is intended. */ +#ifndef DPIF_NETDEV_H +#define DPIF_NETDEV_H 1 + +#include +#include +#include +#include "dpif.h" +#include "dp-packet.h" +#include "dpif-netdev-ext2.h" +#include "fat-rwlock.h" +#include "mov-avg.h" +#include "netdev-offload.h" +#include "openvswitch/types.h" +#include "packets.h" +#include "pvector.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP + * headers to be aligned on a 4-byte boundary. */ +enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN }; + +bool dpif_is_netdev2(const struct dpif *); + +#define NR_QUEUE 1 +#define NR_PMD_THREADS 1 + +enum { MAX_METERS = 5000 }; + +/* Kinds of work item, stored in dp_offload_thread_item.type. + * DP_OFFLOAD_TYPE_NUM is derived from the last enumerator, so new kinds + * have to be added before DP_OFFLOAD_STATS_CLEAR or the macro updated. */ +enum dp_offload_type { + DP_OFFLOAD_FLOW, + DP_OFFLOAD_CONN, + DP_OFFLOAD_FLUSH, + DP_OFFLOAD_STATS_CLEAR, +}; +#define DP_OFFLOAD_TYPE_NUM (DP_OFFLOAD_STATS_CLEAR + 1) + +/* Flow offload operations; presumably the values of the 'op' member of the + * offload items below -- confirm. */ +enum { + DP_NETDEV_FLOW_OFFLOAD_OP_NONE, + DP_NETDEV_FLOW_OFFLOAD_OP_ADD, + DP_NETDEV_FLOW_OFFLOAD_OP_MOD, + DP_NETDEV_FLOW_OFFLOAD_OP_DEL, +}; + +/* Data structure to keep packet order till fastpath processing. */ +struct dp_packet_flow_map { + struct dp_packet *packet; + struct dp_netdev_flow *flow; + uint16_t tcp_flags; +}; + +struct dp_offload_flow_item { + struct dp_netdev_flow *flow; + int op; + struct match match; + struct nlattr *actions; + size_t actions_len; + odp_port_t orig_in_port; /* Originating in_port for tnl flows. 
*/ + bool is_e2e_cache_flow; + uintptr_t ct_counter_key; + struct flows_counter_key flows_counter_key; +}; + +struct dp_offload_conn_item { + int op; + struct conntrack *ct; + struct conn *conn; +}; + +struct dp_offload_flush_item { + struct netdev *netdev; + struct ovs_refcount *count; + struct ovs_mutex *mutex; + pthread_cond_t *cond; +}; + +union dp_offload_thread_data { + struct dp_offload_flow_item flow; + struct dp_offload_conn_item conn; + struct dp_offload_flush_item flush; +}; + +struct dp_offload_thread_item { + struct mpsc_queue_node node; + struct ovsrcu_gc_node gc_node; + enum dp_offload_type type; + long long int timestamp; + struct dp_netdev *dp; + union dp_offload_thread_data data[0]; +}; + +struct dp_offload_queue_metrics { + struct histogram wait_time; + struct histogram service_time; + struct histogram sojourn_time; +}; + +struct dp_offload_thread { + PADDED_MEMBERS(CACHE_LINE_SIZE, + struct mpsc_queue offload_queue; + bool high_latency_event; + atomic_bool active; + atomic_uint64_t enqueued_ct_add; + atomic_uint64_t enqueued_offload; + struct cmap megaflow_to_mark; + struct cmap mark_to_flow; + struct mov_avg_cma cma; + struct mov_avg_ema ema; + struct histogram latency; + atomic_uint64_t ct_uni_dir_connections; + atomic_uint64_t ct_bi_dir_connections; + struct mpsc_queue ufid_queue; + struct mpsc_queue trace_queue; + struct e2e_cache_stats e2e_stats; + struct dp_offload_queue_metrics queue_metrics[DP_OFFLOAD_TYPE_NUM]; + ); +}; + +enum txq_req_mode { + TXQ_REQ_MODE_THREAD, + TXQ_REQ_MODE_HASH, +}; + +enum txq_mode { + TXQ_MODE_STATIC, + TXQ_MODE_XPS, + TXQ_MODE_XPS_HASH, +}; + +/* A port in a netdev-based datapath. */ +struct dp_netdev_port { + odp_port_t port_no; + enum txq_mode txq_mode; /* static, XPS, XPS_HASH. */ + bool need_reconfigure; /* True if we should reconfigure netdev. */ + struct netdev *netdev; + struct hmap_node node; /* Node in dp_netdev's 'ports'. 
*/ + struct netdev_saved_flags *sf; + struct dp_netdev_rxq *rxqs; + unsigned n_rxq; /* Number of elements in 'rxqs' */ + unsigned *txq_used; /* Number of threads that use each tx queue. */ + struct ovs_mutex txq_used_mutex; + bool emc_enabled; /* If true EMC will be used. */ + char *type; /* Port type as requested by user. */ + char *rxq_affinity_list; /* Requested affinity of rx queues. */ + enum txq_req_mode txq_requested_mode; + bool disabled; + bool offload_disabled; +}; + +struct dp_meter_band { + uint32_t rate; + uint32_t burst_size; + atomic_uint64_t bucket; /* In 1/1000 packets for PKTPS, + * or in bits for KBPS. */ + atomic_uint64_t packet_count; + atomic_uint64_t byte_count; +}; + +struct dp_meter { + struct cmap_node node; + struct ovs_mutex lock; + uint32_t id; + uint16_t flags; + uint16_t n_bands; + uint32_t max_delta_t; + atomic_uint64_t used; /* Time of a last use in milliseconds. */ + atomic_uint64_t packet_count; + atomic_uint64_t byte_count; + struct dp_meter_band bands[]; +}; + +struct pmd_auto_lb { + bool do_dry_run; + bool recheck_config; + bool is_enabled; /* Current status of Auto load balancing. */ + uint64_t rebalance_intvl; + uint64_t rebalance_poll_timer; + uint8_t rebalance_improve_thresh; + atomic_uint8_t rebalance_load_thresh; +}; + +enum sched_assignment_type { + SCHED_ROUNDROBIN, + SCHED_CYCLES, /* Default.*/ + SCHED_GROUP +}; + +struct dpcls { + struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */ + odp_port_t in_port; + struct cmap subtables_map; + struct pvector subtables; +}; + +/* Datapath based on the network device interface from netdev.h. + * + * + * Thread-safety + * ============= + * + * Some members, marked 'const', are immutable. Accessing other members + * requires synchronization, as noted in more detail below. 
+ * + * Acquisition order is, from outermost to innermost: + * + * dp_netdev_mutex (global) + * port_rwlock + * bond_mutex + * non_pmd_mutex + */ +struct dp_netdev { + const struct dpif_class *const class; + const char *const name; + struct ovs_refcount ref_cnt; + atomic_flag destroyed; + pthread_t system_port_wd_thread; + atomic_bool system_port_wd_exit; + + /* Ports. + * + * Any lookup into 'ports' or any access to the dp_netdev_ports found + * through 'ports' requires taking 'port_rwlock'. */ + struct ovs_rwlock port_rwlock; + struct hmap ports; + struct seq *port_seq; /* Incremented whenever a port changes. */ + + /* The time that a packet can wait in output batch for sending. */ + atomic_uint32_t tx_flush_interval; + + /* Meters. */ + struct ovs_mutex meters_lock; + struct cmap meters OVS_GUARDED; + + /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/ + atomic_uint32_t emc_insert_min; + /* Enable collection of PMD performance metrics. */ + atomic_bool pmd_perf_metrics; + /* Default max load based sleep request. */ + uint64_t pmd_max_sleep_default; + /* Register the PMD as quiescent when idle. */ + atomic_bool pmd_quiet_idle; + /* Enable the SMC cache from ovsdb config */ + atomic_bool smc_enable_db; + + /* Protects access to ofproto-dpif-upcall interface during revalidator + * thread synchronization. */ + struct fat_rwlock upcall_rwlock; + upcall_callback *upcall_cb; /* Callback function for executing upcalls. */ + void *upcall_aux; + + /* Callback function for notifying the purging of dp flows (during + * reseting pmd deletion). */ + dp_purge_callback *dp_purge_cb; + void *dp_purge_aux; + + /* Stores all 'struct dp_netdev_pmd_thread's. */ + struct cmap poll_threads; + /* id pool for per thread static_tx_qid. */ + struct id_pool *tx_qid_pool; + struct ovs_mutex tx_qid_pool_mutex; + /* Rxq to pmd assignment type. 
*/ + enum sched_assignment_type pmd_rxq_assign_type; + bool pmd_iso; + + /* Protects the access of the 'struct dp_netdev_pmd_thread' + * instance for non-pmd thread. */ + struct ovs_mutex non_pmd_mutex; + + /* Each pmd thread will store its pointer to + * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */ + ovsthread_key_t per_pmd_key; + + struct seq *reconfigure_seq; + uint64_t last_reconfigure_seq; + + /* Cpu mask for pin of pmd threads. */ + char *pmd_cmask; + char *req_pmd_cmask; + + /* PMD max load based sleep request user string. */ + char *max_sleep_list; + + uint64_t last_tnl_conf_seq; + + struct conntrack *conntrack; + struct pmd_auto_lb pmd_alb; + + /* Bonds. */ + struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */ + struct cmap tx_bonds; /* Contains 'struct tx_bond'. */ +}; + +extern struct dp_offload_thread dp_offload_threads[MAX_OFFLOAD_THREAD_NB]; + +struct dp_netdev *get_dp_netdev2(const struct dpif *dpif); + +/* Time in microseconds to try RCU quiescing. */ +#define PMD_RCU_QUIESCE_INTERVAL 10000LL + +void +dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid); + +uint32_t +megaflow_to_mark_find(const ovs_u128 *mega_ufid); + +struct dp_netdev_actions * +dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow); + +#define DP_NETDEV_OFFLOAD_BACKOFF_MIN 1 +#define DP_NETDEV_OFFLOAD_BACKOFF_MAX 64 +#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (100 * 1000) /* 100 ms */ + +void +dp_offload_flush(struct dp_offload_thread_item *item); + +void +dp_netdev_free_offload(struct dp_offload_thread_item *offload); + +int +dp_netdev_flow_offload_put(struct dp_offload_thread_item *item); + +int +dp_netdev_flow_offload_del(struct dp_offload_thread_item *item); + +struct dp_offload_thread_item * +dp_netdev_alloc_flow_offload(struct dp_netdev *dp, + struct dp_netdev_flow *flow, + int op, long long now); + +void +dp_offload_flow(struct dp_offload_thread_item *item); + +void +dp_netdev_offload_init(void); + +void 
+dp_netdev_port_rdlock_at(struct dp_netdev *dp, unsigned long long int limit_ms, + const char *where) + OVS_ACQ_RDLOCK(dp->port_rwlock); + +#define dp_netdev_port_rdlock(dp) \ + dp_netdev_port_rdlock_at(dp, 1000, OVS_SOURCE_LOCATOR) + +#define dp_netdev_port_rdlock_limit(dp, limit_ms) \ + dp_netdev_port_rdlock_at(dp, limit_ms, OVS_SOURCE_LOCATOR) + +void +dp_netdev_esw_ports_set_disabled(struct dp_netdev *dp, struct netdev *esw_mgr, bool value) + OVS_REQ_WRLOCK(dp->port_rwlock); + +void +log_all_pmd_sleeps(struct dp_netdev *dp); + +bool +set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config); + +#ifdef __cplusplus +} +#endif + +#endif /* netdev.h */ diff --git a/lib/dpif.c b/lib/dpif.c index 07f210d2dd9..060a4e8c893 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -155,6 +155,7 @@ dp_initialize(void) dp_register_dynamic_provider("dummy"); dp_register_dynamic_provider("netlink2"); + dp_register_dynamic_provider("netdev2"); ovsthread_once_done(&once); }