From b27892b0be2053f2b5f10b01ac9232b70496553a Mon Sep 17 00:00:00 2001 From: Rajasekar Raja Date: Wed, 28 Aug 2024 10:40:26 -0700 Subject: [PATCH 1/6] zebra: backpressure - decrease stream allocation size for L2/L3 VNI Currently, zebra-bgp communcication via zapi for L2/L3 VNI allocates more memory than required. So, on a system where BGP is slow/busy and zebra is quick, triggers such as ADD L2/L3 VNI can result in huge buffer growth in zebra thereby spiking up the memory because a VNI ADD/DEL operation includes - Expensive Walk of the entire global routing table per L2/L3 VNI. - The next read (say of another VNI ADD/DEL) from the socket does not proceed unless the current walk is complete. This bigger stream allocation accounts a portion to that memory spike. Fix is to reduce the stream allocation size to a reasonable value when zebra informs BGP about local EVPN L2/L3 VNI Addition or Deletion. Note: - Future commits will optimize the inline global routing table walk for triggers where bigger set of VNIs flap (Ex: br_default/br_vni flap). - Currently, focus is only on communication between zebra and bgp for L2/L3 VNI add/del. Need to evaluate this for other zapi msgs. Ticket :#3864372 Signed-off-by: Donald Sharp sharpd@nvidia.com Signed-off-by: Rajasekar Raja --- lib/zclient.h | 5 +++++ zebra/zebra_evpn.c | 4 ++-- zebra/zebra_vxlan.c | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/zclient.h b/lib/zclient.h index 6da9558aa560..1ddf020683c4 100644 --- a/lib/zclient.h +++ b/lib/zclient.h @@ -47,6 +47,11 @@ typedef uint16_t zebra_size_t; #define ZEBRA_MAX_PACKET_SIZ 16384U #define ZEBRA_SMALL_PACKET_SIZE 200U +/* Only for L2/L3 VNI Add/Del */ +#define ZEBRA_VNI_MAX_PACKET_SIZE 80U +#define ZEBRA_VNI_SMALL_PACKET_SIZE 50U +#define ZEBRA_VNI_MIN_PACKET_SIZE 25U + /* Zebra header size. */ #define ZEBRA_HEADER_SIZE 10 diff --git a/zebra/zebra_evpn.c b/zebra/zebra_evpn.c index 25c616c4c956..edff084bacf0 100644 --- a/zebra/zebra_evpn.c +++ b/zebra/zebra_evpn.c @@ -1105,7 +1105,7 @@ int zebra_evpn_send_add_to_client(struct zebra_evpn *zevpn) svi_index = zevpn->svi_if ? zevpn->svi_if->ifindex : 0; - s = stream_new(ZEBRA_SMALL_PACKET_SIZE); + s = stream_new(ZEBRA_VNI_SMALL_PACKET_SIZE); zclient_create_header(s, ZEBRA_VNI_ADD, zebra_vrf_get_evpn_id()); stream_putl(s, zevpn->vni); @@ -1157,7 +1157,7 @@ int zebra_evpn_send_del_to_client(struct zebra_evpn *zevpn) zebra_evpn_update_all_es(zevpn); } - s = stream_new(ZEBRA_SMALL_PACKET_SIZE); + s = stream_new(ZEBRA_VNI_MIN_PACKET_SIZE); stream_reset(s); zclient_create_header(s, ZEBRA_VNI_DEL, zebra_vrf_get_evpn_id()); diff --git a/zebra/zebra_vxlan.c b/zebra/zebra_vxlan.c index ad112a4ab1f4..d4d0a1df1d24 100644 --- a/zebra/zebra_vxlan.c +++ b/zebra/zebra_vxlan.c @@ -2203,7 +2203,7 @@ static int zl3vni_send_add_to_client(struct zebra_l3vni *zl3vni) is_anycast_mac = false; } - s = stream_new(ZEBRA_MAX_PACKET_SIZ); + s = stream_new(ZEBRA_VNI_MAX_PACKET_SIZE); /* The message is used for both vni add and/or update like * vrr mac is added for l3vni SVI. @@ -2246,7 +2246,7 @@ static int zl3vni_send_del_to_client(struct zebra_l3vni *zl3vni) if (!client) return 0; - s = stream_new(ZEBRA_MAX_PACKET_SIZ); + s = stream_new(ZEBRA_VNI_MIN_PACKET_SIZE); zclient_create_header(s, ZEBRA_L3VNI_DEL, zl3vni_vrf_id(zl3vni)); stream_putl(s, zl3vni->vni); From 55f056ccbfda537b42279e855abd0319b0154a46 Mon Sep 17 00:00:00 2001 From: Trey Aspelund Date: Wed, 25 Jan 2023 13:47:24 -0500 Subject: [PATCH 2/6] bgpd: add EVPN route type msg list Adds a msg list for getting strings mapping to enum bgp_evpn_route_type Ticket: #3318830 Signed-off-by: Trey Aspelund --- bgpd/bgp_evpn_private.h | 9 +++++++++ lib/prefix.h | 1 + 2 files changed, 10 insertions(+) diff --git a/bgpd/bgp_evpn_private.h b/bgpd/bgp_evpn_private.h index b05df3d82ae7..24b4d0993fbf 100644 --- a/bgpd/bgp_evpn_private.h +++ b/bgpd/bgp_evpn_private.h @@ -32,6 +32,15 @@ #define BGP_EVPN_TYPE4_V4_PSIZE 23 #define BGP_EVPN_TYPE4_V6_PSIZE 34 +static const struct message bgp_evpn_route_type_str[] = { + {BGP_EVPN_UNKN_ROUTE, "UNKNOWN"}, + {BGP_EVPN_AD_ROUTE, "AD"}, + {BGP_EVPN_MAC_IP_ROUTE, "MACIP"}, + {BGP_EVPN_IMET_ROUTE, "IMET"}, + {BGP_EVPN_ES_ROUTE, "ES"}, + {BGP_EVPN_IP_PREFIX_ROUTE, "IP-PREFIX"}, + {0}}; + RB_HEAD(bgp_es_evi_rb_head, bgp_evpn_es_evi); RB_PROTOTYPE(bgp_es_evi_rb_head, bgp_evpn_es_evi, rb_node, bgp_es_evi_rb_cmp); diff --git a/lib/prefix.h b/lib/prefix.h index 2d679d06224f..2f1a437b80b4 100644 --- a/lib/prefix.h +++ b/lib/prefix.h @@ -26,6 +26,7 @@ extern "C" { /* EVPN route types. */ typedef enum { + BGP_EVPN_UNKN_ROUTE = 0, BGP_EVPN_AD_ROUTE = 1, /* Ethernet Auto-Discovery (A-D) route */ BGP_EVPN_MAC_IP_ROUTE, /* MAC/IP Advertisement route */ BGP_EVPN_IMET_ROUTE, /* Inclusive Multicast Ethernet Tag route */ From 58e8563b80f7b8de32599e92d35d95dbe09ecd53 Mon Sep 17 00:00:00 2001 From: Rajasekar Raja Date: Wed, 28 Aug 2024 11:38:53 -0700 Subject: [PATCH 3/6] bgpd: backpressure - Add a typesafe list for Zebra L2VNI and L3VNI - For L2vni, struct bgp_master holds a type safe list of all the VNIs(struct bgpevpn) that needs to be processed. - For L3vni, struct bgp_master holds a type safe list of all the BGP_VRFs(struct bgp) that needs to be processed. Future commits will use this. Ticket :#3864372 Signed-off-by: Rajasekar Raja --- bgpd/bgp_evpn_private.h | 4 ++++ bgpd/bgp_main.c | 2 ++ bgpd/bgpd.c | 2 ++ bgpd/bgpd.h | 12 ++++++++++++ 4 files changed, 20 insertions(+) diff --git a/bgpd/bgp_evpn_private.h b/bgpd/bgp_evpn_private.h index 24b4d0993fbf..65f17aa73d55 100644 --- a/bgpd/bgp_evpn_private.h +++ b/bgpd/bgp_evpn_private.h @@ -117,11 +117,15 @@ struct bgpevpn { /* List of local ESs */ struct list *local_es_evi_list; + struct zebra_l2_vni_item zl2vni; + QOBJ_FIELDS; }; DECLARE_QOBJ_TYPE(bgpevpn); +DECLARE_LIST(zebra_l2_vni, struct bgpevpn, zl2vni); + /* Mapping of Import RT to VNIs. * The Import RTs of all VNIs are maintained in a hash table with each * RT linking to all VNIs that will import routes matching this RT. diff --git a/bgpd/bgp_main.c b/bgpd/bgp_main.c index 535d2fc5f434..9ca20c949a30 100644 --- a/bgpd/bgp_main.c +++ b/bgpd/bgp_main.c @@ -207,6 +207,8 @@ static __attribute__((__noreturn__)) void bgp_exit(int status) bgp_nhg_finish(); zebra_announce_fini(&bm->zebra_announce_head); + zebra_l2_vni_fini(&bm->zebra_l2_vni_head); + zebra_l3_vni_fini(&bm->zebra_l3_vni_head); /* reverse bgp_dump_init */ bgp_dump_finish(); diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c index f92ae969f8c4..8b05ac880ec4 100644 --- a/bgpd/bgpd.c +++ b/bgpd/bgpd.c @@ -8492,6 +8492,8 @@ void bgp_master_init(struct event_loop *master, const int buffer_size, bm = &bgp_master; zebra_announce_init(&bm->zebra_announce_head); + zebra_l2_vni_init(&bm->zebra_l2_vni_head); + zebra_l3_vni_init(&bm->zebra_l3_vni_head); bm->bgp = list_new(); bm->listen_sockets = list_new(); bm->port = BGP_PORT_DEFAULT; diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index df55d879e71d..5835eaac6b57 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -19,6 +19,8 @@ #include "asn.h" PREDECL_LIST(zebra_announce); +PREDECL_LIST(zebra_l2_vni); +PREDECL_LIST(zebra_l3_vni); /* For union sockunion. */ #include "queue.h" @@ -204,6 +206,12 @@ struct bgp_master { /* To preserve ordering of installations into zebra across all Vrfs */ struct zebra_announce_head zebra_announce_head; + /* To preserve ordering of processing of L2 VNIs in BGP */ + struct zebra_l2_vni_head zebra_l2_vni_head; + + /* To preserve ordering of processing of BGP-VRFs for L3 VNIs */ + struct zebra_l3_vni_head zebra_l3_vni_head; + QOBJ_FIELDS; }; DECLARE_QOBJ_TYPE(bgp_master); @@ -868,10 +876,14 @@ struct bgp { uint64_t node_already_on_queue; uint64_t node_deferred_on_queue; + struct zebra_l3_vni_item zl3vni; + QOBJ_FIELDS; }; DECLARE_QOBJ_TYPE(bgp); +DECLARE_LIST(zebra_l3_vni, struct bgp, zl3vni); + struct bgp_interface { #define BGP_INTERFACE_MPLS_BGP_FORWARDING (1 << 0) /* L3VPN multi domain switching */ From 353a706d4e9d3154f4839cace196a37fd5239560 Mon Sep 17 00:00:00 2001 From: Rajasekar Raja Date: Wed, 28 Aug 2024 12:11:16 -0700 Subject: [PATCH 4/6] bgpd: backpressure - Optimize EVPN L2VNI remote routes processing Anytime BGP gets a L2 VNI ADD from zebra, - Walking the entire global routing table per L2VNI is very expensive. - The next read (say of another VNI ADD) from the socket does not proceed unless this walk is complete. So for triggers where a bulk of L2VNI's are flapped, this results in huge output buffer FIFO growth spiking up the memory in zebra since bgp is slow/busy processing the first message. To avoid this, idea is to hookup the VPN off the bgp_master struct and maintain a VPN FIFO list which is processed later on, where we walk a chunk of VPNs and do the remote route install. Note: So far in the L3 backpressure cases(#15524), we have considered the fact that zebra is slow, and the buffer grows in the BGP. However this is the reverse i.e. BGP is very busy processing the first ZAPI message from zebra due to which the buffer grows huge in zebra and memory spikes up. Ticket :#3864372 Signed-off-by: Rajasekar Raja --- bgpd/bgp_evpn.c | 226 +++++++++++++++++++++++++++++++--------- bgpd/bgp_evpn.h | 1 + bgpd/bgp_evpn_private.h | 5 +- bgpd/bgp_zebra.c | 17 +++ bgpd/bgp_zebra.h | 1 + bgpd/bgpd.c | 35 +++++-- bgpd/bgpd.h | 1 + 7 files changed, 227 insertions(+), 59 deletions(-) diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c index f173bd01f20f..31371920047f 100644 --- a/bgpd/bgp_evpn.c +++ b/bgpd/bgp_evpn.c @@ -3979,30 +3979,81 @@ static int install_uninstall_routes_for_vrf(struct bgp *bgp_vrf, bool install) return 0; } +#define BGP_PROC_L2VNI_LIMIT 10 +static int install_evpn_remote_route_per_l2vni(struct bgp *bgp, struct bgp_path_info *pi, + const struct prefix_evpn *evp) +{ + int ret = 0; + uint8_t vni_iter = 0; + struct bgpevpn *t_vpn = NULL; + struct bgpevpn *t_vpn_next = NULL; + + for (t_vpn = zebra_l2_vni_first(&bm->zebra_l2_vni_head); + t_vpn && vni_iter < BGP_PROC_L2VNI_LIMIT; t_vpn = t_vpn_next) { + t_vpn_next = zebra_l2_vni_next(&bm->zebra_l2_vni_head, t_vpn); + vni_iter++; + /* + * Skip install/uninstall if the route entry is not needed to + * be imported into the VNI i.e. RTs dont match + */ + if (!is_route_matching_for_vni(bgp, t_vpn, pi)) + continue; + + ret = install_evpn_route_entry(bgp, t_vpn, evp, pi); + + if (ret) { + flog_err(EC_BGP_EVPN_FAIL, + "%u: Failed to install EVPN %s route in VNI %u during BP", + bgp->vrf_id, bgp_evpn_route_type_str[evp->prefix.route_type].str, + t_vpn->vni); + zebra_l2_vni_del(&bm->zebra_l2_vni_head, t_vpn); + + return ret; + } + } + + return 0; +} + /* * Install or uninstall routes of specified type that are appropriate for this * particular VNI. */ -static int install_uninstall_routes_for_vni(struct bgp *bgp, - struct bgpevpn *vpn, bool install) +int install_uninstall_routes_for_vni(struct bgp *bgp, struct bgpevpn *vpn, bool install) { afi_t afi; safi_t safi; struct bgp_dest *rd_dest, *dest; struct bgp_table *table; struct bgp_path_info *pi; - int ret; + int ret = 0; + uint8_t count = 0; + bool walk_fifo = false; afi = AFI_L2VPN; safi = SAFI_EVPN; - /* Walk entire global routing table and evaluate routes which could be + if (!bgp) { + walk_fifo = true; + bgp = bgp_get_evpn(); + if (!bgp) { + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("No BGP EVPN instance found..."); + + return -1; + } + } + + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("%s: Total %u L2VNI VPNs pending to be processed for remote route installation", + __func__, (uint32_t)zebra_l2_vni_count(&bm->zebra_l2_vni_head)); + /* + * Walk entire global routing table and evaluate routes which could be * imported into this VPN. Note that we cannot just look at the routes - * for - * the VNI's RD - remote routes applicable for this VNI could have any - * RD. + * for the VNI's RD - remote routes applicable for this VNI could have + * any RD. + * Note: EVPN routes are a 2-level table. */ - /* EVPN routes are a 2-level table. */ for (rd_dest = bgp_table_top(bgp->rib[afi][safi]); rd_dest; rd_dest = bgp_route_next(rd_dest)) { table = bgp_dest_get_bgp_table_info(rd_dest); @@ -4015,54 +4066,79 @@ static int install_uninstall_routes_for_vni(struct bgp *bgp, (const struct prefix_evpn *)bgp_dest_get_prefix( dest); - if (evp->prefix.route_type != BGP_EVPN_IMET_ROUTE && - evp->prefix.route_type != BGP_EVPN_AD_ROUTE && - evp->prefix.route_type != BGP_EVPN_MAC_IP_ROUTE) + /* Proceed only for IMET/AD/MAC_IP routes */ + switch (evp->prefix.route_type) { + case BGP_EVPN_IMET_ROUTE: + case BGP_EVPN_AD_ROUTE: + case BGP_EVPN_MAC_IP_ROUTE: + break; + default: continue; + } for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { - /* Consider "valid" remote routes applicable for - * this VNI. */ - if (!(CHECK_FLAG(pi->flags, BGP_PATH_VALID) - && pi->type == ZEBRA_ROUTE_BGP - && pi->sub_type == BGP_ROUTE_NORMAL)) - continue; - - if (!is_route_matching_for_vni(bgp, vpn, pi)) + /* + * Skip install/uninstall if + * - Not a valid remote routes + * - Install & evpn route matchesi macvrf SOO + */ + if (!(CHECK_FLAG(pi->flags, BGP_PATH_VALID) && + pi->type == ZEBRA_ROUTE_BGP && + pi->sub_type == BGP_ROUTE_NORMAL) || + (install && bgp_evpn_route_matches_macvrf_soo(pi, evp))) continue; - if (install) { - if (bgp_evpn_route_matches_macvrf_soo( - pi, evp)) + if (walk_fifo) { + ret = install_evpn_remote_route_per_l2vni(bgp, pi, evp); + if (ret) { + bgp_dest_unlock_node(rd_dest); + bgp_dest_unlock_node(dest); + return ret; + } + } else { + /* + * Skip install/uninstall if the route + * entry is not needed to be imported + * into the VNI i.e. RTs dont match + */ + if (!is_route_matching_for_vni(bgp, vpn, pi)) continue; - ret = install_evpn_route_entry(bgp, vpn, - evp, pi); - } else - ret = uninstall_evpn_route_entry( - bgp, vpn, evp, pi); - - if (ret) { - flog_err(EC_BGP_EVPN_FAIL, - "%u: Failed to %s EVPN %s route in VNI %u", - bgp->vrf_id, - install ? "install" - : "uninstall", - evp->prefix.route_type == - BGP_EVPN_MAC_IP_ROUTE - ? "MACIP" - : "IMET", - vpn->vni); - - bgp_dest_unlock_node(rd_dest); - bgp_dest_unlock_node(dest); - return ret; + if (install) + ret = install_evpn_route_entry(bgp, vpn, evp, pi); + else + ret = uninstall_evpn_route_entry(bgp, vpn, evp, pi); + + if (ret) { + flog_err(EC_BGP_EVPN_FAIL, + "%u: Failed to %s EVPN %s route in VNI %u", + bgp->vrf_id, + install ? "install" : "uninstall", + bgp_evpn_route_type_str[evp->prefix.route_type] + .str, + vpn->vni); + + bgp_dest_unlock_node(rd_dest); + bgp_dest_unlock_node(dest); + return ret; + } } } } } + if (walk_fifo) { + while (count < BGP_PROC_L2VNI_LIMIT) { + vpn = zebra_l2_vni_pop(&bm->zebra_l2_vni_head); + if (!vpn) + return 0; + + UNSET_FLAG(vpn->flags, VNI_FLAG_ADD); + count++; + } + } + return 0; } @@ -7024,6 +7100,45 @@ int bgp_evpn_local_l3vni_del(vni_t l3vni, vrf_id_t vrf_id) return 0; } +static void bgp_evpn_l2vni_remote_route_processing(struct bgpevpn *vpn) +{ + /* + * Anytime BGP gets a Bulk of L2 VNIs ADD/UPD from zebra, + * - Walking the entire global routing table per VNI is very expensive. + * - The next read (say of another VNI ADD/UPD) from the socket does + * not proceed unless this walk is complete. + * This results in huge output buffer FIFO growth spiking up the + * memory in zebra. + * + * To avoid this, idea is to hookup the VPN off the struct bgp_master + * and maintain a VPN FIFO list which is processed later on, where we + * walk a chunk of VPNs and do the remote route install. + */ + if (!CHECK_FLAG(vpn->flags, VNI_FLAG_ADD)) { + zebra_l2_vni_add_tail(&bm->zebra_l2_vni_head, vpn); + SET_FLAG(vpn->flags, VNI_FLAG_ADD); + } + + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("Scheduling L2VNI ADD to be processed later for VNI %u", vpn->vni); + + /* + * If there are no VNI's in the bgp VPN FIFO list i.e. an update + * for an already processed VNI comes in, schedule the remote + * route install immediately. + * + * In all other cases, it is ok to schedule the remote route install + * after a small sleep. This is to give benefit of doubt in case more + * L2VNI ADD events come. + */ + if (zebra_l2_vni_count(&bm->zebra_l2_vni_head)) + event_add_timer_msec(bm->master, bgp_zebra_process_remote_routes_for_l2vni, NULL, + 20, &bm->t_bgp_zebra_l2_vni); + else + event_add_event(bm->master, bgp_zebra_process_remote_routes_for_l2vni, NULL, 0, + &bm->t_bgp_zebra_l2_vni); +} + /* * When bgp instance goes down also clean up what might have been left over * from evpn. @@ -7047,6 +7162,10 @@ int bgp_evpn_local_vni_del(struct bgp *bgp, vni_t vni) if (!vpn) return 0; + /* Remove the VPN from the bgp VPN FIFO (if exists) */ + UNSET_FLAG(vpn->flags, VNI_FLAG_ADD); + zebra_l2_vni_del(&bm->zebra_l2_vni_head, vpn); + /* Remove all local EVPN routes and schedule for processing (to * withdraw from peers). */ @@ -7203,12 +7322,6 @@ int bgp_evpn_local_vni_add(struct bgp *bgp, vni_t vni, } } - /* If we have learnt and retained remote routes (VTEPs, MACs) for this - * VNI, - * install them. - */ - install_routes_for_vni(bgp, vpn); - /* If we are advertising gateway mac-ip It needs to be conveyed again to zebra */ bgp_zebra_advertise_gw_macip(bgp, vpn->advertise_gw_macip, vpn->vni); @@ -7216,6 +7329,8 @@ int bgp_evpn_local_vni_add(struct bgp *bgp, vni_t vni, /* advertise svi mac-ip knob to zebra */ bgp_zebra_advertise_svi_macip(bgp, vpn->advertise_svi_macip, vpn->vni); + bgp_evpn_l2vni_remote_route_processing(vpn); + return 0; } @@ -7245,8 +7360,17 @@ void bgp_evpn_flood_control_change(struct bgp *bgp) */ void bgp_evpn_cleanup_on_disable(struct bgp *bgp) { - hash_iterate(bgp->vnihash, (void (*)(struct hash_bucket *, - void *))cleanup_vni_on_disable, + struct bgpevpn *vpn = NULL; + uint32_t vni_count = zebra_l2_vni_count(&bm->zebra_l2_vni_head); + + /* Cleanup VNI FIFO list from this bgp instance */ + while (vni_count) { + vpn = zebra_l2_vni_pop(&bm->zebra_l2_vni_head); + UNSET_FLAG(vpn->flags, VNI_FLAG_ADD); + vni_count--; + } + + hash_iterate(bgp->vnihash, (void (*)(struct hash_bucket *, void *))cleanup_vni_on_disable, bgp); } diff --git a/bgpd/bgp_evpn.h b/bgpd/bgp_evpn.h index 1a333a5a09a3..75dde616ce78 100644 --- a/bgpd/bgp_evpn.h +++ b/bgpd/bgp_evpn.h @@ -200,4 +200,5 @@ bool bgp_evpn_skip_vrf_import_of_local_es(struct bgp *bgp_vrf, const struct pref int uninstall_evpn_route_entry_in_vrf(struct bgp *bgp_vrf, const struct prefix_evpn *evp, struct bgp_path_info *parent_pi); extern void bgp_zebra_evpn_pop_items_from_announce_fifo(struct bgpevpn *vpn); +extern int install_uninstall_routes_for_vni(struct bgp *bgp, struct bgpevpn *vpn, bool install); #endif /* _QUAGGA_BGP_EVPN_H */ diff --git a/bgpd/bgp_evpn_private.h b/bgpd/bgp_evpn_private.h index 65f17aa73d55..cc92807d42b4 100644 --- a/bgpd/bgp_evpn_private.h +++ b/bgpd/bgp_evpn_private.h @@ -62,8 +62,9 @@ struct bgpevpn { #define VNI_FLAG_RD_CFGD 0x4 /* RD is user configured. */ #define VNI_FLAG_IMPRT_CFGD 0x8 /* Import RT is user configured */ #define VNI_FLAG_EXPRT_CFGD 0x10 /* Export RT is user configured */ -#define VNI_FLAG_USE_TWO_LABELS 0x20 /* Attach both L2-VNI and L3-VNI if - needed for this VPN */ +/* Attach both L2-VNI and L3-VNI if needed for this VPN */ +#define VNI_FLAG_USE_TWO_LABELS 0x20 +#define VNI_FLAG_ADD 0x40 /* L2VNI Add */ struct bgp *bgp_vrf; /* back pointer to the vrf instance */ diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c index 688dfacaa0b6..1eb614385c06 100644 --- a/bgpd/bgp_zebra.c +++ b/bgpd/bgp_zebra.c @@ -3025,6 +3025,23 @@ static void bgp_zebra_connected(struct zclient *zclient) BGP_GR_ROUTER_DETECT_AND_SEND_CAPABILITY_TO_ZEBRA(bgp, bgp->peer); } +void bgp_zebra_process_remote_routes_for_l2vni(struct event *e) +{ + /* + * If we have learnt and retained remote routes (VTEPs, MACs) + * for this VNI, install them. + */ + install_uninstall_routes_for_vni(NULL, NULL, true); + + /* + * If there are VNIs still pending to be processed, schedule them + * after a small sleep so that CPU can be used for other purposes. + */ + if (zebra_l2_vni_count(&bm->zebra_l2_vni_head)) + event_add_timer_msec(bm->master, bgp_zebra_process_remote_routes_for_l2vni, NULL, + 20, &bm->t_bgp_zebra_l2_vni); +} + static int bgp_zebra_process_local_es_add(ZAPI_CALLBACK_ARGS) { esi_t esi; diff --git a/bgpd/bgp_zebra.h b/bgpd/bgp_zebra.h index 8deecba747b3..993d002998f2 100644 --- a/bgpd/bgp_zebra.h +++ b/bgpd/bgp_zebra.h @@ -135,4 +135,5 @@ extern void bgp_zebra_release_label_range(uint32_t start, uint32_t end); extern enum zclient_send_status bgp_zebra_withdraw_actual(struct bgp_dest *dest, struct bgp_path_info *info, struct bgp *bgp); +extern void bgp_zebra_process_remote_routes_for_l2vni(struct event *e); #endif /* _QUAGGA_BGP_ZEBRA_H */ diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c index 8b05ac880ec4..45c4229afa3c 100644 --- a/bgpd/bgpd.c +++ b/bgpd/bgpd.c @@ -3966,11 +3966,14 @@ int bgp_delete(struct bgp *bgp) afi_t afi; safi_t safi; int i; + uint32_t vni_count; + struct bgpevpn *vpn = NULL; struct bgp_dest *dest = NULL; struct bgp_dest *dest_next = NULL; struct bgp_table *dest_table = NULL; struct graceful_restart_info *gr_info; - uint32_t cnt_before, cnt_after; + uint32_t b_ann_cnt = 0, b_l2_cnt = 0; + uint32_t a_ann_cnt = 0, a_l2_cnt = 0; assert(bgp); @@ -3978,7 +3981,7 @@ int bgp_delete(struct bgp *bgp) * Iterate the pending dest list and remove all the dest pertaininig to * the bgp under delete. */ - cnt_before = zebra_announce_count(&bm->zebra_announce_head); + b_ann_cnt = zebra_announce_count(&bm->zebra_announce_head); for (dest = zebra_announce_first(&bm->zebra_announce_head); dest; dest = dest_next) { dest_next = zebra_announce_next(&bm->zebra_announce_head, dest); @@ -3990,10 +3993,28 @@ int bgp_delete(struct bgp *bgp) } } - cnt_after = zebra_announce_count(&bm->zebra_announce_head); - if (BGP_DEBUG(zebra, ZEBRA)) - zlog_debug("Zebra Announce Fifo cleanup count before %u and after %u during BGP %s deletion", - cnt_before, cnt_after, bgp->name_pretty); + /* + * Pop all VPNs yet to be processed for remote routes install if the + * bgp-evpn instance is getting deleted + */ + if (bgp == bgp_get_evpn()) { + b_l2_cnt = zebra_l2_vni_count(&bm->zebra_l2_vni_head); + vni_count = b_l2_cnt; + while (vni_count) { + vpn = zebra_l2_vni_pop(&bm->zebra_l2_vni_head); + UNSET_FLAG(vpn->flags, VNI_FLAG_ADD); + vni_count--; + } + } + + if (BGP_DEBUG(zebra, ZEBRA)) { + a_ann_cnt = zebra_announce_count(&bm->zebra_announce_head); + a_l2_cnt = zebra_l2_vni_count(&bm->zebra_l2_vni_head); + zlog_debug("FIFO Cleanup Count during BGP %s deletion :: " + "Zebra Announce - before %u after %u :: " + "BGP L2_VNI - before %u after %u", + bgp->name_pretty, b_ann_cnt, a_ann_cnt, b_l2_cnt, a_l2_cnt); + } bgp_soft_reconfig_table_task_cancel(bgp, NULL, NULL); @@ -8517,6 +8538,7 @@ void bgp_master_init(struct event_loop *master, const int buffer_size, bm->stalepath_time = BGP_DEFAULT_STALEPATH_TIME; bm->select_defer_time = BGP_DEFAULT_SELECT_DEFERRAL_TIME; bm->rib_stale_time = BGP_DEFAULT_RIB_STALE_TIME; + bm->t_bgp_zebra_l2_vni = NULL; bgp_mac_init(); /* init the rd id space. @@ -8764,6 +8786,7 @@ void bgp_terminate(void) EVENT_OFF(bm->t_bgp_sync_label_manager); EVENT_OFF(bm->t_bgp_start_label_manager); EVENT_OFF(bm->t_bgp_zebra_route); + EVENT_OFF(bm->t_bgp_zebra_l2_vni); bgp_mac_finish(); } diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index 5835eaac6b57..139871e4825c 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -206,6 +206,7 @@ struct bgp_master { /* To preserve ordering of installations into zebra across all Vrfs */ struct zebra_announce_head zebra_announce_head; + struct event *t_bgp_zebra_l2_vni; /* To preserve ordering of processing of L2 VNIs in BGP */ struct zebra_l2_vni_head zebra_l2_vni_head; From 888351d094ac0fbc0ef3c2cf42a3669b11bc0daf Mon Sep 17 00:00:00 2001 From: Rajasekar Raja Date: Thu, 31 Oct 2024 13:16:35 -0700 Subject: [PATCH 5/6] bgpd: backpressure - Optimize EVPN L3VNI remote routes processing Anytime BGP gets a L3 VNI ADD/DEL from zebra, - Walking the entire global routing table per L3VNI is very expensive. - The next read (say of another VNI ADD/DEL) from the socket does not proceed unless this walk is complete. So for triggers where a bulk of L3VNI's are flapped, this results in huge output buffer FIFO growth spiking up the memory in zebra since bgp is slow/busy processing the first message. To avoid this, idea is to hookup the BGP-VRF off the struct bgp_master and maintain a struct bgp FIFO list which is processed later on, where we walk a chunk of BGP-VRFs and do the remote route install/uninstall. Ticket :#3864372 Signed-off-by: Rajasekar Raja --- bgpd/bgp_evpn.c | 249 +++++++++++++++++++++++++++++++++++++---------- bgpd/bgp_evpn.h | 1 + bgpd/bgp_zebra.c | 25 +++++ bgpd/bgp_zebra.h | 1 + bgpd/bgpd.c | 24 +++-- bgpd/bgpd.h | 3 + 6 files changed, 243 insertions(+), 60 deletions(-) diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c index 31371920047f..11a4611a14ac 100644 --- a/bgpd/bgp_evpn.c +++ b/bgpd/bgp_evpn.c @@ -79,6 +79,8 @@ static void bgp_evpn_remote_ip_hash_unlink_nexthop(struct hash_bucket *bucket, void *args); static struct in_addr zero_vtep_ip; +static void bgp_evpn_local_l3vni_del_post_processing(struct bgp *bgp_vrf); + /* * Private functions. */ @@ -3882,14 +3884,6 @@ int bgp_evpn_route_entry_install_if_vrf_match(struct bgp *bgp_vrf, const struct prefix_evpn *evp = (const struct prefix_evpn *)bgp_dest_get_prefix(pi->net); - /* Consider "valid" remote routes applicable for - * this VRF. - */ - if (!(CHECK_FLAG(pi->flags, BGP_PATH_VALID) - && pi->type == ZEBRA_ROUTE_BGP - && pi->sub_type == BGP_ROUTE_NORMAL)) - return 0; - if (is_route_matching_for_vrf(bgp_vrf, pi)) { if (bgp_evpn_route_rmac_self_check(bgp_vrf, evp, pi)) return 0; @@ -3916,26 +3910,67 @@ int bgp_evpn_route_entry_install_if_vrf_match(struct bgp *bgp_vrf, return ret; } +#define BGP_PROC_L3VNI_LIMIT 10 +static int install_uninstall_evpn_remote_route_per_l3vni(struct bgp_path_info *pi, + const struct prefix_evpn *evp) +{ + int ret = 0; + uint8_t vni_iter = 0; + bool is_install = false; + struct bgp *bgp_to_proc = NULL; + struct bgp *bgp_to_proc_next = NULL; + + for (bgp_to_proc = zebra_l3_vni_first(&bm->zebra_l3_vni_head); + bgp_to_proc && vni_iter < BGP_PROC_L3VNI_LIMIT; bgp_to_proc = bgp_to_proc_next) { + bgp_to_proc_next = zebra_l3_vni_next(&bm->zebra_l3_vni_head, bgp_to_proc); + vni_iter++; + is_install = !!CHECK_FLAG(bgp_to_proc->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_INSTALL); + + ret = bgp_evpn_route_entry_install_if_vrf_match(bgp_to_proc, pi, is_install); + if (ret) { + flog_err(EC_BGP_EVPN_FAIL, + "%u: Failed to %s EVPN %s route in L3VNI %u during BP", + bgp_to_proc->vrf_id, is_install ? "install" : "uninstall", + bgp_evpn_route_type_str[evp->prefix.route_type].str, + bgp_to_proc->l3vni); + zebra_l3_vni_del(&bm->zebra_l3_vni_head, bgp_to_proc); + if (!is_install) + bgp_evpn_local_l3vni_del_post_processing(bgp_to_proc); + + return ret; + } + } + + return 0; +} /* * Install or uninstall mac-ip routes are appropriate for this * particular VRF. */ -static int install_uninstall_routes_for_vrf(struct bgp *bgp_vrf, bool install) +int install_uninstall_routes_for_vrf(struct bgp *bgp_vrf, bool install) { afi_t afi; safi_t safi; struct bgp_dest *rd_dest, *dest; struct bgp_table *table; struct bgp_path_info *pi; - int ret; + int ret = 0; struct bgp *bgp_evpn = NULL; + uint8_t count = 0; afi = AFI_L2VPN; safi = SAFI_EVPN; bgp_evpn = bgp_get_evpn(); - if (!bgp_evpn) + if (!bgp_evpn) { + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("No BGP EVPN instance found..."); + return -1; + } + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("%s: Total %u L3VNI BGP-VRFs pending to be processed for remote route installation", + __func__, (uint32_t)zebra_l3_vni_count(&bm->zebra_l3_vni_head)); /* Walk entire global routing table and evaluate routes which could be * imported into this VRF. Note that we need to loop through all global * routes to determine which route matches the import rt on vrf @@ -3952,30 +3987,72 @@ static int install_uninstall_routes_for_vrf(struct bgp *bgp_vrf, bool install) (const struct prefix_evpn *)bgp_dest_get_prefix( dest); - /* if not mac-ip route skip this route */ - if (!(evp->prefix.route_type == BGP_EVPN_MAC_IP_ROUTE - || evp->prefix.route_type - == BGP_EVPN_IP_PREFIX_ROUTE)) - continue; - - /* if not a mac+ip route skip this route */ - if (!(is_evpn_prefix_ipaddr_v4(evp) - || is_evpn_prefix_ipaddr_v6(evp))) + /* Proceed only for MAC_IP/IP-Pfx routes */ + switch (evp->prefix.route_type) { + case BGP_EVPN_IP_PREFIX_ROUTE: + case BGP_EVPN_MAC_IP_ROUTE: + if (!(is_evpn_prefix_ipaddr_v4(evp) || + is_evpn_prefix_ipaddr_v6(evp))) + continue; + break; + default: continue; + } for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = pi->next) { - ret = bgp_evpn_route_entry_install_if_vrf_match( - bgp_vrf, pi, install); - if (ret) { - bgp_dest_unlock_node(rd_dest); - bgp_dest_unlock_node(dest); - return ret; + /* Consider "valid" remote routes applicable for + * this VRF */ + if (!(CHECK_FLAG(pi->flags, BGP_PATH_VALID) && + pi->type == ZEBRA_ROUTE_BGP && + pi->sub_type == BGP_ROUTE_NORMAL)) + continue; + + if (!bgp_vrf) { + ret = install_uninstall_evpn_remote_route_per_l3vni(pi, evp); + if (ret) { + bgp_dest_unlock_node(rd_dest); + bgp_dest_unlock_node(dest); + + return ret; + } + } else { + ret = bgp_evpn_route_entry_install_if_vrf_match(bgp_vrf, pi, + install); + if (ret) { + flog_err(EC_BGP_EVPN_FAIL, + "%u: Failed to %s EVPN %s route in L3VNI %u", + bgp_vrf->vrf_id, + install ? "install" : "uninstall", + bgp_evpn_route_type_str[evp->prefix.route_type] + .str, + bgp_vrf->l3vni); + bgp_dest_unlock_node(rd_dest); + bgp_dest_unlock_node(dest); + + return ret; + } } } } } + if (!bgp_vrf) { + while (count < BGP_PROC_L3VNI_LIMIT) { + struct bgp *bgp_to_proc = zebra_l3_vni_pop(&bm->zebra_l3_vni_head); + + if (!bgp_to_proc) + return 0; + + if (CHECK_FLAG(bgp_to_proc->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_DELETE)) + bgp_evpn_local_l3vni_del_post_processing(bgp_to_proc); + + UNSET_FLAG(bgp_to_proc->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_INSTALL); + UNSET_FLAG(bgp_to_proc->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_DELETE); + count++; + } + } + return 0; } @@ -6856,6 +6933,53 @@ static void link_l2vni_hash_to_l3vni(struct hash_bucket *bucket, bgpevpn_link_to_l3vni(vpn); } +static void bgp_evpn_l3vni_remote_route_processing(struct bgp *bgp, bool install) +{ + /* + * Anytime BGP gets a Bulk of L3 VNI ADD/DEL from zebra, + * - Walking the entire global routing table per VNI is very expensive. + * - The next read (say of another VNI ADD/DEL) from the socket does + * not proceed unless this walk is complete. + * This results in huge output buffer FIFO growth spiking up the + * memory in zebra. + * + * To avoid this, idea is to hookup the BGP-VRF off the struct + * bgp_master and maintain a struct bgp FIFO list which is processed + * later on, where we walk a chunk of BGP-VRFs and do the remote route + * install/uninstall. + */ + if (!CHECK_FLAG(bgp->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_INSTALL) && + !CHECK_FLAG(bgp->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_DELETE)) + zebra_l3_vni_add_tail(&bm->zebra_l3_vni_head, bgp); + + if (install) { + SET_FLAG(bgp->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_INSTALL); + UNSET_FLAG(bgp->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_DELETE); + } else { + SET_FLAG(bgp->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_DELETE); + UNSET_FLAG(bgp->flags, BGP_FLAG_L3VNI_SCHEDULE_FOR_INSTALL); + } + + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("Scheduling L3VNI %s to be processed later for %s VNI %u", + install ? "ADD" : "DEL", bgp->name_pretty, bgp->l3vni); + /* + * If there are no BGP-VRFs's in the bm L3VNI FIFO list i.e. an update + * for an already processed L3VNI comes in, schedule the remote route + * install immediately. + * + * In all other cases, it is ok to schedule the remote route un/install + * after a small sleep. This is to give benefit of doubt in case more + * L3VNI events come. + */ + if (zebra_l3_vni_count(&bm->zebra_l3_vni_head)) + event_add_timer_msec(bm->master, bgp_zebra_process_remote_routes_for_l3vrf, NULL, + 20, &bm->t_bgp_zebra_l3_vni); + else + event_add_event(bm->master, bgp_zebra_process_remote_routes_for_l3vrf, NULL, 0, + &bm->t_bgp_zebra_l3_vni); +} + int bgp_evpn_local_l3vni_add(vni_t l3vni, vrf_id_t vrf_id, struct ethaddr *svi_rmac, struct ethaddr *vrr_rmac, @@ -7001,52 +7125,36 @@ int bgp_evpn_local_l3vni_add(vni_t l3vni, vrf_id_t vrf_id, /* advertise type-5 routes if needed */ update_advertise_vrf_routes(bgp_vrf); - /* install all remote routes belonging to this l3vni into correspondng - * vrf */ - install_routes_for_vrf(bgp_vrf); + bgp_evpn_l3vni_remote_route_processing(bgp_vrf, true); return 0; } -int bgp_evpn_local_l3vni_del(vni_t l3vni, vrf_id_t vrf_id) +static void bgp_evpn_local_l3vni_del_post_processing(struct bgp *bgp_vrf) { - struct bgp *bgp_vrf = NULL; /* bgp vrf instance */ struct bgp *bgp_evpn = NULL; /* EVPN bgp instance */ struct listnode *node = NULL; struct listnode *next = NULL; struct bgpevpn *vpn = NULL; - bgp_vrf = bgp_lookup_by_vrf_id(vrf_id); - if (!bgp_vrf) { - flog_err( - EC_BGP_NO_DFLT, - "Cannot process L3VNI %u Del - Could not find BGP instance", - l3vni); - return -1; - } - bgp_evpn = bgp_get_evpn(); if (!bgp_evpn) { - flog_err( - EC_BGP_NO_DFLT, - "Cannot process L3VNI %u Del - Could not find EVPN BGP instance", - l3vni); - return -1; + flog_err(EC_BGP_NO_DFLT, + "Cannot process L3VNI %u Del - Could not find EVPN BGP instance", + bgp_vrf->l3vni); + return; } if (CHECK_FLAG(bgp_evpn->flags, BGP_FLAG_DELETE_IN_PROGRESS)) { flog_err(EC_BGP_NO_DFLT, - "Cannot process L3VNI %u ADD - EVPN BGP instance is shutting down", - l3vni); - return -1; + "Cannot process L3VNI %u ADD - EVPN BGP instance is shutting down", + bgp_vrf->l3vni); + return; } - /* Remove remote routes from BGT VRF even if BGP_VRF_AUTO is configured, - * bgp_delete would not remove/decrement bgp_path_info of the ip_prefix - * routes. This will uninstalling the routes from zebra and decremnt the - * bgp info count. - */ - uninstall_routes_for_vrf(bgp_vrf); + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug("In %s for L3VNI %u after remote route installation", __func__, + bgp_vrf->l3vni); /* delete/withdraw all type-5 routes */ delete_withdraw_vrf_routes(bgp_vrf); @@ -7096,6 +7204,39 @@ int bgp_evpn_local_l3vni_del(vni_t l3vni, vrf_id_t vrf_id) /* Delete the instance if it was autocreated */ if (CHECK_FLAG(bgp_vrf->vrf_flags, BGP_VRF_AUTO)) bgp_delete(bgp_vrf); +} + +int bgp_evpn_local_l3vni_del(vni_t l3vni, vrf_id_t vrf_id) +{ + struct bgp *bgp_evpn = NULL; /* EVPN bgp instance */ + struct bgp *bgp_vrf = NULL; /* bgp vrf instance */ + + bgp_vrf = bgp_lookup_by_vrf_id(vrf_id); + if (!bgp_vrf) { + flog_err(EC_BGP_NO_DFLT, + "Cannot process L3VNI %u Del - Could not find BGP instance", l3vni); + return -1; + } + + bgp_evpn = bgp_get_evpn(); + if (!bgp_evpn) { + flog_err(EC_BGP_NO_DFLT, + "Cannot process L3VNI %u Del - Could not find EVPN BGP instance", l3vni); + return -1; + } + + if (CHECK_FLAG(bgp_evpn->flags, BGP_FLAG_DELETE_IN_PROGRESS)) { + flog_err(EC_BGP_NO_DFLT, + "Cannot process L3VNI %u ADD - EVPN BGP instance is shutting down", l3vni); + return -1; + } + + /* + * Move all the l3vni_delete operation post the remote route + * installation processing i.e. add the L3VNI DELETE item on the + * BGP-VRFs FIFO and move on. + */ + bgp_evpn_l3vni_remote_route_processing(bgp_vrf, false); return 0; } diff --git a/bgpd/bgp_evpn.h b/bgpd/bgp_evpn.h index 75dde616ce78..8bbc5d3c37f2 100644 --- a/bgpd/bgp_evpn.h +++ b/bgpd/bgp_evpn.h @@ -201,4 +201,5 @@ int uninstall_evpn_route_entry_in_vrf(struct bgp *bgp_vrf, const struct prefix_e struct bgp_path_info *parent_pi); extern void bgp_zebra_evpn_pop_items_from_announce_fifo(struct bgpevpn *vpn); extern int install_uninstall_routes_for_vni(struct bgp *bgp, struct bgpevpn *vpn, bool install); +extern int install_uninstall_routes_for_vrf(struct bgp *bgp_vrf, bool install); #endif /* _QUAGGA_BGP_EVPN_H */ diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c index 1eb614385c06..783dec7802db 100644 --- a/bgpd/bgp_zebra.c +++ b/bgpd/bgp_zebra.c @@ -3042,6 +3042,31 @@ void bgp_zebra_process_remote_routes_for_l2vni(struct event *e) 20, &bm->t_bgp_zebra_l2_vni); } +void bgp_zebra_process_remote_routes_for_l3vrf(struct event *e) +{ + /* + * Install/Uninstall all remote routes belonging to l3vni + * + * NOTE: + * - At this point it does not matter whether we call + * install_routes_for_vrf/uninstall_routes_for_vrf. + * - Since we pass struct bgp as NULL, + * * we iterate the bm FIFO list + * * the second variable (true) is ignored as well and + * calculated based on the BGP-VRFs flags for ADD/DELETE. + */ + install_uninstall_routes_for_vrf(NULL, true); + + /* + * If there are L3VNIs still pending to be processed, schedule them + * after a small sleep so that CPU can be used for other purposes. + */ + if (zebra_l3_vni_count(&bm->zebra_l3_vni_head)) { + event_add_timer_msec(bm->master, bgp_zebra_process_remote_routes_for_l3vrf, NULL, + 20, &bm->t_bgp_zebra_l3_vni); + } +} + static int bgp_zebra_process_local_es_add(ZAPI_CALLBACK_ARGS) { esi_t esi; diff --git a/bgpd/bgp_zebra.h b/bgpd/bgp_zebra.h index 993d002998f2..7e9d57cb8521 100644 --- a/bgpd/bgp_zebra.h +++ b/bgpd/bgp_zebra.h @@ -136,4 +136,5 @@ extern enum zclient_send_status bgp_zebra_withdraw_actual(struct bgp_dest *dest, struct bgp_path_info *info, struct bgp *bgp); extern void bgp_zebra_process_remote_routes_for_l2vni(struct event *e); +extern void bgp_zebra_process_remote_routes_for_l3vrf(struct event *e); #endif /* _QUAGGA_BGP_ZEBRA_H */ diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c index 45c4229afa3c..f2149c3256cc 100644 --- a/bgpd/bgpd.c +++ b/bgpd/bgpd.c @@ -3972,8 +3972,10 @@ int bgp_delete(struct bgp *bgp) struct bgp_dest *dest_next = NULL; struct bgp_table *dest_table = NULL; struct graceful_restart_info *gr_info; - uint32_t b_ann_cnt = 0, b_l2_cnt = 0; - uint32_t a_ann_cnt = 0, a_l2_cnt = 0; + uint32_t b_ann_cnt = 0, b_l2_cnt = 0, b_l3_cnt = 0; + uint32_t a_ann_cnt = 0, a_l2_cnt = 0, a_l3_cnt = 0; + struct bgp *bgp_to_proc = NULL; + struct bgp *bgp_to_proc_next = NULL; assert(bgp); @@ -4007,13 +4009,21 @@ int bgp_delete(struct bgp *bgp) } } + b_l3_cnt = zebra_l3_vni_count(&bm->zebra_l3_vni_head); + for (bgp_to_proc = zebra_l3_vni_first(&bm->zebra_l3_vni_head); bgp_to_proc; + bgp_to_proc = bgp_to_proc_next) { + bgp_to_proc_next = zebra_l3_vni_next(&bm->zebra_l3_vni_head, bgp_to_proc); + if (bgp_to_proc == bgp) + zebra_l3_vni_del(&bm->zebra_l3_vni_head, bgp_to_proc); + } + if (BGP_DEBUG(zebra, ZEBRA)) { a_ann_cnt = zebra_announce_count(&bm->zebra_announce_head); a_l2_cnt = zebra_l2_vni_count(&bm->zebra_l2_vni_head); - zlog_debug("FIFO Cleanup Count during BGP %s deletion :: " - "Zebra Announce - before %u after %u :: " - "BGP L2_VNI - before %u after %u", - bgp->name_pretty, b_ann_cnt, a_ann_cnt, b_l2_cnt, a_l2_cnt); + a_l3_cnt = zebra_l3_vni_count(&bm->zebra_l3_vni_head); + zlog_debug("BGP %s deletion FIFO cnt Zebra_Ann before %u after %u, L2_VNI before %u after, %u L3_VNI before %u after %u", + bgp->name_pretty, b_ann_cnt, a_ann_cnt, b_l2_cnt, a_l2_cnt, b_l3_cnt, + a_l3_cnt); } bgp_soft_reconfig_table_task_cancel(bgp, NULL, NULL); @@ -8539,6 +8549,7 @@ void bgp_master_init(struct event_loop *master, const int buffer_size, bm->select_defer_time = BGP_DEFAULT_SELECT_DEFERRAL_TIME; bm->rib_stale_time = BGP_DEFAULT_RIB_STALE_TIME; bm->t_bgp_zebra_l2_vni = NULL; + bm->t_bgp_zebra_l3_vni = NULL; bgp_mac_init(); /* init the rd id space. @@ -8787,6 +8798,7 @@ void bgp_terminate(void) EVENT_OFF(bm->t_bgp_start_label_manager); EVENT_OFF(bm->t_bgp_zebra_route); EVENT_OFF(bm->t_bgp_zebra_l2_vni); + EVENT_OFF(bm->t_bgp_zebra_l3_vni); bgp_mac_finish(); } diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index 139871e4825c..bdfc3316febb 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -210,6 +210,7 @@ struct bgp_master { /* To preserve ordering of processing of L2 VNIs in BGP */ struct zebra_l2_vni_head zebra_l2_vni_head; + struct event *t_bgp_zebra_l3_vni; /* To preserve ordering of processing of BGP-VRFs for L3 VNIs */ struct zebra_l3_vni_head zebra_l3_vni_head; @@ -563,6 +564,8 @@ struct bgp { #define BGP_FLAG_INSTANCE_HIDDEN (1ULL << 39) /* Prohibit BGP from enabling IPv6 RA on interfaces */ #define BGP_FLAG_IPV6_NO_AUTO_RA (1ULL << 40) +#define BGP_FLAG_L3VNI_SCHEDULE_FOR_INSTALL (1ULL << 41) +#define BGP_FLAG_L3VNI_SCHEDULE_FOR_DELETE (1ULL << 42) /* BGP default address-families. * New peers inherit enabled afi/safis from bgp instance. From 2cfe7bd8862e716a23ecfd6fd6478a18af422691 Mon Sep 17 00:00:00 2001 From: Rajasekar Raja Date: Tue, 26 Nov 2024 12:02:53 -0800 Subject: [PATCH 6/6] bgpd: Suppress redundant L3VNI delete processing Consider a master bridge interface (br_l3vni) having a slave vxlan99 mapped to vlans used by 3 L3VNIs. During ifdown br_l3vni interface, the function zebra_vxlan_process_l3vni_oper_down() where zebra sends ZAPI to bgp for a delete L3VNI is sent twice. 1) if_down -> zebra_vxlan_svi_down() 2) VXLAN is unlinked from the bridge i.e. vxlan99 zebra_if_dplane_ifp_handling() --> zebra_vxlan_if_update_vni() (since ZEBRA_VXLIF_MASTER_CHANGE flag is set) During ifup br_l3vni interface, the function zebra_vxlan_process_l3vni_oper_down() is invoked because of access-vlan change - process oper down, associate with new svi_if and then process oper up again The problem here is that the redundant ZAPI message of L3VNI delete results in BGP doing a inline Global table walk for remote route installation when the L3VNI is already removed/deleted. Bigger the scale, more wastage is the CPU utilization. Given the triggers for bridge flap is not a common scenario, idea is to simply return from BGP if the L3VNI is already set to 0 i.e. if the L3VNI is already deleted, do nothing and return. NOTE/TBD: An ideal fix is to make zebra not send the second L3VNI delete ZAPI message. However it is a much involved and a day-1 code to handle corner cases. Ticket :#3864372 Signed-off-by: Rajasekar Raja --- bgpd/bgp_evpn.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/bgpd/bgp_evpn.c b/bgpd/bgp_evpn.c index 11a4611a14ac..400475a86107 100644 --- a/bgpd/bgp_evpn.c +++ b/bgpd/bgp_evpn.c @@ -7231,6 +7231,15 @@ int bgp_evpn_local_l3vni_del(vni_t l3vni, vrf_id_t vrf_id) return -1; } + if (!bgp_vrf->l3vni) { + if (BGP_DEBUG(zebra, ZEBRA)) + zlog_debug( + "Returning from %s since VNI %u is already deleted", + __func__, l3vni); + + return -1; + } + /* * Move all the l3vni_delete operation post the remote route * installation processing i.e. add the L3VNI DELETE item on the