From a1fe93ca0b346fe5a42aa5f4f91dbb07809811e1 Mon Sep 17 00:00:00 2001
From: Shi Su <ssu@microsoft.com>
Date: Thu, 14 Oct 2021 04:13:02 +0000
Subject: [PATCH] ADD support for overlay ecmp

---
 orchagent/nexthopkey.h  |   2 +
 orchagent/routeorch.cpp |  15 ++
 orchagent/routeorch.h   |   4 +
 orchagent/vnetorch.cpp  | 376 ++++++++++++++++++++++++++++++++++++----
 orchagent/vnetorch.h    |  41 +++--
 tests/test_vnet.py      | 246 ++++++++++++++++++++++++--
 6 files changed, 626 insertions(+), 58 deletions(-)

diff --git a/orchagent/nexthopkey.h b/orchagent/nexthopkey.h
index 87c294415a2..29f5115592c 100644
--- a/orchagent/nexthopkey.h
+++ b/orchagent/nexthopkey.h
@@ -81,6 +81,8 @@ struct NextHopKey
         weight = 0;
     }
 
+    NextHopKey(const IpAddress &ip, const MacAddress &mac, const uint32_t &vni, bool overlay_nh) : ip_address(ip), alias(""), vni(vni), mac_address(mac) { weight = 0; /* weight would otherwise be left uninitialized; overlay_nh is not read */ }
+
     const std::string to_string() const
     {
         std::string str = formatMplsNextHop();
diff --git a/orchagent/routeorch.cpp b/orchagent/routeorch.cpp
index d54e205dedf..bead332d67b 100644
--- a/orchagent/routeorch.cpp
+++ b/orchagent/routeorch.cpp
@@ -2150,3 +2150,18 @@ bool RouteOrch::removeOverlayNextHops(sai_object_id_t vrf_id, const NextHopGroup
 
     return true;
 }
+
+void RouteOrch::increaseNextHopGroupCount()
+{
+    m_nextHopGroupCount ++;
+}
+
+void RouteOrch::decreaseNextHopGroupCount()
+{
+    m_nextHopGroupCount --;
+}
+
+bool RouteOrch::checkNextHopGroupCount()
+{
+    return m_nextHopGroupCount < m_maxNextHopGroupCount;
+}
diff --git a/orchagent/routeorch.h b/orchagent/routeorch.h
index 20e79699d58..89d48ec2184 100644
--- a/orchagent/routeorch.h
+++ b/orchagent/routeorch.h
@@ -156,6 +156,10 @@ class RouteOrch : public Orch, public Subject
     void delLinkLocalRouteToMe(sai_object_id_t vrf_id, IpPrefix linklocal_prefix);
     std::string getLinkLocalEui64Addr(void);
 
+    void increaseNextHopGroupCount();
+    void decreaseNextHopGroupCount();
+    bool checkNextHopGroupCount();
+
 private:
     SwitchOrch *m_switchOrch;
     NeighOrch *m_neighOrch;
diff --git a/orchagent/vnetorch.cpp b/orchagent/vnetorch.cpp
index efc60d22c25..a15554371a5 100644
--- a/orchagent/vnetorch.cpp
+++ b/orchagent/vnetorch.cpp
@@ -20,6 +20,7 @@
 #include "intfsorch.h"
 #include "neighorch.h"
 #include "crmorch.h"
+#include "routeorch.h"
 
 extern sai_virtual_router_api_t* sai_virtual_router_api;
 extern sai_route_api_t* sai_route_api;
@@ -28,6 +29,7 @@ extern sai_router_interface_api_t* sai_router_intfs_api;
 extern sai_fdb_api_t* sai_fdb_api;
 extern sai_neighbor_api_t* sai_neighbor_api;
 extern sai_next_hop_api_t* sai_next_hop_api;
+extern sai_next_hop_group_api_t* sai_next_hop_group_api;
 extern sai_object_id_t gSwitchId;
 extern sai_object_id_t gVirtualRouterId;
 extern Directory<Orch*> gDirectory;
@@ -35,6 +37,7 @@ extern PortsOrch *gPortsOrch;
 extern IntfsOrch *gIntfsOrch;
 extern NeighOrch *gNeighOrch;
 extern CrmOrch *gCrmOrch;
+extern RouteOrch *gRouteOrch;
 extern MacAddress gVxlanMacAddress;
 
 /*
@@ -150,15 +153,18 @@ bool VNetVrfObject::hasRoute(IpPrefix& ipPrefix)
     return false;
 }
 
-bool VNetVrfObject::addRoute(IpPrefix& ipPrefix, tunnelEndpoint& endp)
+bool VNetVrfObject::addRoute(IpPrefix& ipPrefix, NextHopGroupKey& nexthops)
 {
-    if (hasRoute(ipPrefix))
+    if (nexthops.is_overlay_nexthop())
     {
-        SWSS_LOG_INFO("VNET route '%s' exists", ipPrefix.to_string().c_str());
+        tunnels_[ipPrefix] = nexthops;
+    }
+    else
+    {
+        SWSS_LOG_ERROR("Input %s is not overlay nexthop group", nexthops.to_string().c_str());
         return false;
     }
 
-    tunnels_[ipPrefix] = endp;
     return true;
 }
 
@@ -237,8 +243,6 @@ bool VNetVrfObject::removeRoute(IpPrefix& ipPrefix)
 
     if (tunnels_.find(ipPrefix) != tunnels_.end())
     {
-        auto endp = tunnels_.at(ipPrefix);
-        removeTunnelNextHop(endp);
         tunnels_.erase(ipPrefix);
     }
     else
@@ -267,32 +271,32 @@ bool VNetVrfObject::getRouteNextHop(IpPrefix& ipPrefix, nextHop& nh)
     return true;
 }
 
-sai_object_id_t VNetVrfObject::getTunnelNextHop(tunnelEndpoint& endp)
+sai_object_id_t VNetVrfObject::getTunnelNextHop(NextHopKey& nh)
 {
     sai_object_id_t nh_id = SAI_NULL_OBJECT_ID;
     auto tun_name = getTunnelName();
 
     VxlanTunnelOrch* vxlan_orch = gDirectory.get<VxlanTunnelOrch*>();
 
-    nh_id = vxlan_orch->createNextHopTunnel(tun_name, endp.ip, endp.mac, endp.vni);
+    nh_id = vxlan_orch->createNextHopTunnel(tun_name, nh.ip_address, nh.mac_address, nh.vni);
     if (nh_id == SAI_NULL_OBJECT_ID)
     {
-        throw std::runtime_error("NH Tunnel create failed for " + vnet_name_ + " ip " + endp.ip.to_string());
+        throw std::runtime_error("NH Tunnel create failed for " + vnet_name_ + " ip " + nh.ip_address.to_string());
     }
 
     return nh_id;
 }
 
-bool VNetVrfObject::removeTunnelNextHop(tunnelEndpoint& endp)
+bool VNetVrfObject::removeTunnelNextHop(NextHopKey& nh)
 {
     auto tun_name = getTunnelName();
 
     VxlanTunnelOrch* vxlan_orch = gDirectory.get<VxlanTunnelOrch*>();
 
-    if (!vxlan_orch->removeNextHopTunnel(tun_name, endp.ip, endp.mac, endp.vni))
+    if (!vxlan_orch->removeNextHopTunnel(tun_name, nh.ip_address, nh.mac_address, nh.vni))
     {
         SWSS_LOG_ERROR("VNET %s Tunnel NextHop remove failed for '%s'",
-                        vnet_name_.c_str(), endp.ip.to_string().c_str());
+                        vnet_name_.c_str(), nh.ip_address.to_string().c_str());
         return false;
     }
 
@@ -603,6 +607,28 @@ static bool add_route(sai_object_id_t vr_id, sai_ip_prefix_t& ip_pfx, sai_object
     return true;
 }
 
+static bool update_route(sai_object_id_t vr_id, sai_ip_prefix_t& ip_pfx, sai_object_id_t nh_id)
+{
+    sai_route_entry_t route_entry;
+    route_entry.vr_id = vr_id;
+    route_entry.switch_id = gSwitchId;
+    route_entry.destination = ip_pfx;
+
+    sai_attribute_t route_attr;
+
+    route_attr.id = SAI_ROUTE_ENTRY_ATTR_NEXT_HOP_ID;
+    route_attr.value.oid = nh_id;
+
+    sai_status_t status = sai_route_api->set_route_entry_attribute(&route_entry, &route_attr);
+    if (status != SAI_STATUS_SUCCESS)
+    {
+        SWSS_LOG_ERROR("SAI failed to update route");
+        return false;
+    }
+
+    return true;
+}
+
 VNetRouteOrch::VNetRouteOrch(DBConnector *db, vector<string> &tableNames, VNetOrch *vnetOrch)
                                   : Orch2(db, tableNames, request_), vnet_orch_(vnetOrch)
 {
@@ -612,9 +638,166 @@ VNetRouteOrch::VNetRouteOrch(DBConnector *db, vector<string> &tableNames, VNetOr
     handler_map_.insert(handler_pair(APP_VNET_RT_TUNNEL_TABLE_NAME, &VNetRouteOrch::handleTunnel));
 }
 
+bool VNetRouteOrch::hasNextHopGroup(const string& vnet, const NextHopGroupKey& nexthops)
+{
+    return syncd_nexthop_groups_[vnet].find(nexthops) != syncd_nexthop_groups_[vnet].end();
+}
+
+sai_object_id_t VNetRouteOrch::getNextHopGroupId(const string& vnet, const NextHopGroupKey& nexthops)
+{
+    assert(hasNextHopGroup(vnet, nexthops));
+    return syncd_nexthop_groups_[vnet][nexthops].next_hop_group_id;
+}
+
+bool VNetRouteOrch::addNextHopGroup(const string& vnet, const NextHopGroupKey &nexthops, VNetVrfObject *vrf_obj)
+{
+    SWSS_LOG_ENTER();
+
+    assert(!hasNextHopGroup(vnet, nexthops));
+
+    if (!gRouteOrch->checkNextHopGroupCount())
+    {
+        SWSS_LOG_ERROR("Reached maximum number of next hop groups. Failed to create new next hop group.");
+        return false;
+    }
+
+    vector<sai_object_id_t> next_hop_ids;
+    set<NextHopKey> next_hop_set = nexthops.getNextHops();
+    std::map<sai_object_id_t, NextHopKey> nhopgroup_members_set;
+
+    for (auto it : next_hop_set)
+    {
+        sai_object_id_t next_hop_id = vrf_obj->getTunnelNextHop(it);
+        next_hop_ids.push_back(next_hop_id);
+        nhopgroup_members_set[next_hop_id] = it;
+    }
+
+    sai_attribute_t nhg_attr;
+    vector<sai_attribute_t> nhg_attrs;
+
+    nhg_attr.id = SAI_NEXT_HOP_GROUP_ATTR_TYPE;
+    nhg_attr.value.s32 = SAI_NEXT_HOP_GROUP_TYPE_ECMP;
+    nhg_attrs.push_back(nhg_attr);
+
+    sai_object_id_t next_hop_group_id;
+    sai_status_t status = sai_next_hop_group_api->create_next_hop_group(&next_hop_group_id,
+                                                                        gSwitchId,
+                                                                        (uint32_t)nhg_attrs.size(),
+                                                                        nhg_attrs.data());
+
+    if (status != SAI_STATUS_SUCCESS)
+    {
+        SWSS_LOG_ERROR("Failed to create next hop group %s, rv:%d",
+                       nexthops.to_string().c_str(), status);
+        return false;
+    }
+
+    gRouteOrch->increaseNextHopGroupCount();
+    gCrmOrch->incCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP);
+    SWSS_LOG_NOTICE("Create next hop group %s", nexthops.to_string().c_str());
+
+    NextHopGroupInfo next_hop_group_entry;
+    next_hop_group_entry.next_hop_group_id = next_hop_group_id;
+
+    for (auto nhid: next_hop_ids)
+    {
+        // Create a next hop group member
+        vector<sai_attribute_t> nhgm_attrs;
+
+        sai_attribute_t nhgm_attr;
+        nhgm_attr.id = SAI_NEXT_HOP_GROUP_MEMBER_ATTR_NEXT_HOP_GROUP_ID;
+        nhgm_attr.value.oid = next_hop_group_id;
+        nhgm_attrs.push_back(nhgm_attr);
+
+        nhgm_attr.id = SAI_NEXT_HOP_GROUP_MEMBER_ATTR_NEXT_HOP_ID;
+        nhgm_attr.value.oid = nhid;
+        nhgm_attrs.push_back(nhgm_attr);
+
+        sai_object_id_t next_hop_group_member_id;
+        status = sai_next_hop_group_api->create_next_hop_group_member(&next_hop_group_member_id,
+                                                                    gSwitchId,
+                                                                    (uint32_t)nhgm_attrs.size(),
+                                                                    nhgm_attrs.data());
+
+        if (status != SAI_STATUS_SUCCESS)
+        {
+            /* The member id may be unset on failure, so log the next hop instead. NOTE(review): group and earlier members are not rolled back here. */
+            SWSS_LOG_ERROR("Failed to add next hop %" PRIx64 " to next hop group %" PRIx64 ": %d", nhid, next_hop_group_id, status);
+            return false;
+        }
+
+        gCrmOrch->incCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP_MEMBER);
+
+        // Save the membership into next hop structure
+        next_hop_group_entry.active_members[nhopgroup_members_set.find(nhid)->second] =
+                                                                next_hop_group_member_id;
+    }
+
+    /*
+     * Initialize the next hop group structure with ref_count as 0. This
+     * count will increase once the route is successfully syncd.
+     */
+    next_hop_group_entry.ref_count = 0;
+    syncd_nexthop_groups_[vnet][nexthops] = next_hop_group_entry;
+
+    return true;
+}
+
+bool VNetRouteOrch::removeNextHopGroup(const string& vnet, const NextHopGroupKey &nexthops, VNetVrfObject *vrf_obj)
+{
+    SWSS_LOG_ENTER();
+
+    sai_object_id_t next_hop_group_id;
+    auto next_hop_group_entry = syncd_nexthop_groups_[vnet].find(nexthops);
+    sai_status_t status;
+
+    assert(next_hop_group_entry != syncd_nexthop_groups_[vnet].end());
+
+    if (next_hop_group_entry->second.ref_count != 0)
+    {
+        return true;
+    }
+
+    next_hop_group_id = next_hop_group_entry->second.next_hop_group_id;
+    SWSS_LOG_NOTICE("Delete next hop group %s", nexthops.to_string().c_str());
+
+    for (auto nhop = next_hop_group_entry->second.active_members.begin();
+         nhop != next_hop_group_entry->second.active_members.end();)
+    {
+        NextHopKey nexthop = nhop->first;
+
+        status = sai_next_hop_group_api->remove_next_hop_group_member(nhop->second);
+        if (status != SAI_STATUS_SUCCESS)
+        {
+            SWSS_LOG_ERROR("Failed to remove next hop group member %" PRIx64 ", rv:%d",
+                           nhop->second, status);
+            return false;
+        }
+
+        vrf_obj->removeTunnelNextHop(nexthop);
+
+        gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP_MEMBER);
+        nhop = next_hop_group_entry->second.active_members.erase(nhop);
+    }
+
+    status = sai_next_hop_group_api->remove_next_hop_group(next_hop_group_id);
+    if (status != SAI_STATUS_SUCCESS)
+    {
+        SWSS_LOG_ERROR("Failed to remove next hop group %" PRIx64 ", rv:%d", next_hop_group_id, status);
+        return false;
+    }
+
+    gRouteOrch->decreaseNextHopGroupCount();
+    gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP);
+
+    syncd_nexthop_groups_[vnet].erase(nexthops);
+
+    return true;
+}
+
 template<>
 bool VNetRouteOrch::doRouteTask<VNetVrfObject>(const string& vnet, IpPrefix& ipPrefix,
-                                               tunnelEndpoint& endp, string& op)
+                                               NextHopGroupKey& nexthops, string& op)
 {
     SWSS_LOG_ENTER();
 
@@ -648,29 +831,126 @@ bool VNetRouteOrch::doRouteTask<VNetVrfObject>(const string& vnet, IpPrefix& ipP
     auto *vrf_obj = vnet_orch_->getTypePtr<VNetVrfObject>(vnet);
     sai_ip_prefix_t pfx;
     copy(pfx, ipPrefix);
-    sai_object_id_t nh_id = (op == SET_COMMAND)?vrf_obj->getTunnelNextHop(endp):SAI_NULL_OBJECT_ID;
 
-    for (auto vr_id : vr_set)
+    if (op == SET_COMMAND)
     {
-        if (op == SET_COMMAND && !add_route(vr_id, pfx, nh_id))
+        sai_object_id_t nh_id;
+        /* Reuse an already-synced next hop or group; otherwise create a tunnel next hop (single endpoint) or an ECMP group */
+        if (!hasNextHopGroup(vnet, nexthops))
         {
-            SWSS_LOG_ERROR("Route add failed for %s, vr_id '0x%" PRIx64, ipPrefix.to_string().c_str(), vr_id);
-            return false;
+            if (nexthops.getSize() == 1)
+            {
+                NextHopKey nexthop(nexthops.to_string(), true);
+                NextHopGroupInfo next_hop_group_entry;
+                next_hop_group_entry.next_hop_group_id = vrf_obj->getTunnelNextHop(nexthop);
+                next_hop_group_entry.ref_count = 0;
+                next_hop_group_entry.active_members[nexthop] = SAI_NULL_OBJECT_ID;
+                syncd_nexthop_groups_[vnet][nexthops] = next_hop_group_entry;
+            }
+            else {
+                if (!addNextHopGroup(vnet, nexthops, vrf_obj))
+                {
+                    SWSS_LOG_ERROR("Failed to create next hop group %s", nexthops.to_string().c_str());
+                    return false;
+                }
+            }
         }
-        else if (op == DEL_COMMAND && !del_route(vr_id, pfx))
+        nh_id = syncd_nexthop_groups_[vnet][nexthops].next_hop_group_id;
+
+        auto it_route = syncd_tunnel_routes_[vnet].find(ipPrefix);
+        for (auto vr_id : vr_set)
         {
-            SWSS_LOG_ERROR("Route del failed for %s, vr_id '0x%" PRIx64, ipPrefix.to_string().c_str(), vr_id);
-            return false;
+            bool route_status = true;
+
+            if (it_route == syncd_tunnel_routes_[vnet].end())
+            {
+                route_status = add_route(vr_id, pfx, nh_id);
+            }
+            else
+            {
+                route_status = update_route(vr_id, pfx, nh_id);
+            }
+
+            if (!route_status)
+            {
+                SWSS_LOG_ERROR("Route add/update failed for %s, vr_id '0x%" PRIx64, ipPrefix.to_string().c_str(), vr_id);
+                /* Clean up the newly created next hop group entry */
+                if (nexthops.getSize() > 1)
+                {
+                    removeNextHopGroup(vnet, nexthops, vrf_obj);
+                }
+                return false;
+            }
         }
-    }
 
-    if (op == SET_COMMAND)
+        /* Take the new group's reference before releasing the old one, so a re-SET to the same group never drops to zero */
+        syncd_nexthop_groups_[vnet][nexthops].ref_count++;
+        if (it_route != syncd_tunnel_routes_[vnet].end())
+        {
+            NextHopGroupKey nhg = it_route->second;
+            if(--syncd_nexthop_groups_[vnet][nhg].ref_count == 0)
+            {
+                if (nhg.getSize() > 1) /* choose the teardown path by the group being released, not the new one */
+                {
+                    removeNextHopGroup(vnet, nhg, vrf_obj);
+                }
+                else
+                {
+                    syncd_nexthop_groups_[vnet].erase(nhg);
+                    NextHopKey nexthop(nhg.to_string(), true);
+                    vrf_obj->removeTunnelNextHop(nexthop);
+                }
+            }
+            vrf_obj->removeRoute(ipPrefix);
+        }
+        syncd_tunnel_routes_[vnet][ipPrefix] = nexthops;
+        vrf_obj->addRoute(ipPrefix, nexthops);
+    }
+    else if (op == DEL_COMMAND)
     {
-        vrf_obj->addRoute(ipPrefix, endp);
+        auto it_route = syncd_tunnel_routes_[vnet].find(ipPrefix);
+        if (it_route == syncd_tunnel_routes_[vnet].end())
+        {
+            SWSS_LOG_INFO("Failed to find tunnel route entry, prefix %s\n",
+                ipPrefix.to_string().c_str());
+            return true;
+        }
+        NextHopGroupKey nhg = it_route->second;
+
+        for (auto vr_id : vr_set)
+        {
+            if (!del_route(vr_id, pfx))
+            {
+                SWSS_LOG_ERROR("Route del failed for %s, vr_id '0x%" PRIx64, ipPrefix.to_string().c_str(), vr_id);
+                return false;
+            }
+        }
+
+        if(--syncd_nexthop_groups_[vnet][nhg].ref_count == 0)
+        {
+            if (nhg.getSize() > 1)
+            {
+                removeNextHopGroup(vnet, nhg, vrf_obj);
+            }
+            else
+            {
+                syncd_nexthop_groups_[vnet].erase(nhg);
+                NextHopKey nexthop(nhg.to_string(), true);
+                vrf_obj->removeTunnelNextHop(nexthop);
+            }
+        }
+
+        syncd_tunnel_routes_[vnet].erase(ipPrefix);
+        if (syncd_tunnel_routes_[vnet].empty())
+        {
+            syncd_tunnel_routes_.erase(vnet);
+        }
+
+        vrf_obj->removeRoute(ipPrefix);
     }
     else
     {
-        vrf_obj->removeRoute(ipPrefix);
+        SWSS_LOG_ERROR("Unknown operation");
     }
 
     return true;
@@ -1041,23 +1321,23 @@ bool VNetRouteOrch::handleTunnel(const Request& request)
 {
     SWSS_LOG_ENTER();
 
-    IpAddress ip;
-    MacAddress mac;
-    uint32_t vni = 0;
+    vector<IpAddress> ip_list;
+    vector<MacAddress> mac_list;
+    vector<uint64_t> vni_list;
 
     for (const auto& name: request.getAttrFieldNames())
     {
         if (name == "endpoint")
         {
-            ip = request.getAttrIP(name);
+            ip_list = request.getAttrIPList(name);
         }
         else if (name == "vni")
         {
-            vni = static_cast<uint32_t>(request.getAttrUint(name));
+            vni_list = request.getAttrUintList(name);
         }
         else if (name == "mac_address")
         {
-            mac = request.getAttrMacAddress(name);
+            mac_list = request.getAttrMacAddressList(name);
         }
         else
         {
@@ -1066,6 +1346,18 @@ bool VNetRouteOrch::handleTunnel(const Request& request)
         }
     }
 
+    if (!vni_list.empty() && vni_list.size() != ip_list.size())
+    {
+        SWSS_LOG_ERROR("VNI size of %zu does not match endpoint size of %zu", vni_list.size(), ip_list.size());
+        return false;
+    }
+
+    if (!mac_list.empty() && mac_list.size() != ip_list.size())
+    {
+        SWSS_LOG_ERROR("MAC address size of %zu does not match endpoint size of %zu", mac_list.size(), ip_list.size());
+        return false;
+    }
+
     const std::string& vnet_name = request.getKeyString(0);
     auto ip_pfx = request.getKeyIpPrefix(1);
     auto op = request.getOperation();
@@ -1073,11 +1365,29 @@ bool VNetRouteOrch::handleTunnel(const Request& request)
     SWSS_LOG_INFO("VNET-RT '%s' op '%s' for pfx %s", vnet_name.c_str(),
                    op.c_str(), ip_pfx.to_string().c_str());
 
-    tunnelEndpoint endp = { ip, mac, vni };
+    NextHopGroupKey nhg("", true);
+    for (size_t idx_ip = 0; idx_ip < ip_list.size(); idx_ip++)
+    {
+        IpAddress ip = ip_list[idx_ip];
+        MacAddress mac;
+        uint32_t vni = 0;
+        if (!vni_list.empty())
+        {
+            vni = static_cast<uint32_t>(vni_list[idx_ip]);
+        }
+
+        if (!mac_list.empty())
+        {
+            mac = mac_list[idx_ip];
+        }
+
+        NextHopKey nh(ip, mac, vni, true);
+        nhg.add(nh);
+    }
 
     if (vnet_orch_->isVnetExecVrf())
     {
-        return doRouteTask<VNetVrfObject>(vnet_name, ip_pfx, endp, op);
+        return doRouteTask<VNetVrfObject>(vnet_name, ip_pfx, nhg, op);
     }
 
     return true;
diff --git a/orchagent/vnetorch.h b/orchagent/vnetorch.h
index 2ca48ec3a0e..d4ae4411f44 100644
--- a/orchagent/vnetorch.h
+++ b/orchagent/vnetorch.h
@@ -12,6 +12,8 @@
 #include "ipaddresses.h"
 #include "producerstatetable.h"
 #include "observer.h"
+#include "intfsorch.h"
+#include "nexthopgroupkey.h"
 
 #define VNET_BITMAP_SIZE 32
 #define VNET_TUNNEL_SIZE 40960
@@ -66,11 +68,11 @@ class VNetRequest : public Request
     VNetRequest() : Request(vnet_request_description, ':') { }
 };
 
-struct tunnelEndpoint
+struct NextHopGroupInfo
 {
-    IpAddress ip;
-    MacAddress mac;
-    uint32_t vni;
+    sai_object_id_t                         next_hop_group_id;      // next hop group id (null for single nexthop)
+    int                                     ref_count;              // reference count
+    std::map<NextHopKey, sai_object_id_t>   active_members;         // active nexthops and nexthop group member id (null for single nexthop)
 };
 
 class VNetObject
@@ -125,7 +127,7 @@ struct nextHop
     string ifname;
 };
 
-typedef std::map<IpPrefix, tunnelEndpoint> TunnelRoutes;
+typedef std::map<IpPrefix, NextHopGroupKey> TunnelRoutes;
 typedef std::map<IpPrefix, nextHop> RouteMap;
 
 class VNetVrfObject : public VNetObject
@@ -165,7 +167,7 @@ class VNetVrfObject : public VNetObject
 
     bool updateObj(vector<sai_attribute_t>&);
 
-    bool addRoute(IpPrefix& ipPrefix, tunnelEndpoint& endp);
+    bool addRoute(IpPrefix& ipPrefix, NextHopGroupKey& nexthops);
     bool addRoute(IpPrefix& ipPrefix, nextHop& nh);
     bool removeRoute(IpPrefix& ipPrefix);
 
@@ -173,8 +175,8 @@ class VNetVrfObject : public VNetObject
     bool getRouteNextHop(IpPrefix& ipPrefix, nextHop& nh);
     bool hasRoute(IpPrefix& ipPrefix);
 
-    sai_object_id_t getTunnelNextHop(tunnelEndpoint& endp);
-    bool removeTunnelNextHop(tunnelEndpoint& endp);
+    sai_object_id_t getTunnelNextHop(NextHopKey& nh);
+    bool removeTunnelNextHop(NextHopKey& nh);
     void increaseNextHopRefCount(const nextHop&);
     void decreaseNextHopRefCount(const nextHop&);
 
@@ -246,11 +248,12 @@ class VNetOrch : public Orch2
 const request_description_t vnet_route_description = {
     { REQ_T_STRING, REQ_T_IP_PREFIX },
     {
-        { "endpoint",    REQ_T_IP },
-        { "ifname",      REQ_T_STRING },
-        { "nexthop",     REQ_T_STRING },
-        { "vni",         REQ_T_UINT },
-        { "mac_address", REQ_T_MAC_ADDRESS },
+        { "endpoint",               REQ_T_IP_LIST },
+        { "ifname",                 REQ_T_STRING },
+        { "nexthop",                REQ_T_STRING },
+        { "vni",                    REQ_T_UINT_LIST },
+        { "mac_address",            REQ_T_MAC_ADDRESS_LIST },
+        { "endpoint_monitor",       REQ_T_IP_LIST },
     },
     { }
 };
@@ -281,6 +284,9 @@ struct VNetNextHopObserverEntry
 /* NextHopObserverTable: Destination IP address, next hop observer entry */
 typedef std::map<IpAddress, VNetNextHopObserverEntry> VNetNextHopObserverTable;
 
+typedef std::map<NextHopGroupKey, NextHopGroupInfo> VNetNextHopGroupInfoTable;
+typedef std::map<IpPrefix, NextHopGroupKey> VNetTunnelRouteTable;
+
 class VNetRouteOrch : public Orch2, public Subject
 {
 public:
@@ -302,8 +308,13 @@ class VNetRouteOrch : public Orch2, public Subject
     bool handleRoutes(const Request&);
     bool handleTunnel(const Request&);
 
+    bool hasNextHopGroup(const string&, const NextHopGroupKey&);
+    sai_object_id_t getNextHopGroupId(const string&, const NextHopGroupKey&);
+    bool addNextHopGroup(const string&, const NextHopGroupKey&, VNetVrfObject *vrf_obj);
+    bool removeNextHopGroup(const string&, const NextHopGroupKey&, VNetVrfObject *vrf_obj);
+
     template<typename T>
-    bool doRouteTask(const string& vnet, IpPrefix& ipPrefix, tunnelEndpoint& endp, string& op);
+    bool doRouteTask(const string& vnet, IpPrefix& ipPrefix, NextHopGroupKey& nexthops, string& op);
 
     template<typename T>
     bool doRouteTask(const string& vnet, IpPrefix& ipPrefix, nextHop& nh, string& op);
@@ -314,6 +325,8 @@ class VNetRouteOrch : public Orch2, public Subject
 
     VNetRouteTable syncd_routes_;
     VNetNextHopObserverTable next_hop_observers_;
+    std::map<std::string, VNetNextHopGroupInfoTable> syncd_nexthop_groups_;
+    std::map<std::string, VNetTunnelRouteTable> syncd_tunnel_routes_;
 };
 
 class VNetCfgRouteOrch : public Orch
diff --git a/tests/test_vnet.py b/tests/test_vnet.py
index c7fd3c1225e..6f5d0b0325a 100644
--- a/tests/test_vnet.py
+++ b/tests/test_vnet.py
@@ -6,6 +6,7 @@
 
 from swsscommon import swsscommon
 from pprint import pprint
+from dvslib.dvs_common import wait_for_result
 
 
 def create_entry(tbl, key, pairs):
@@ -139,7 +140,11 @@ def delete_vnet_local_routes(dvs, prefix, vnet_name):
     time.sleep(2)
 
 
-def create_vnet_routes(dvs, prefix, vnet_name, endpoint, mac="", vni=0):
+def create_vnet_routes(dvs, prefix, vnet_name, endpoint, mac="", vni=0, ep_monitor=""):
+    set_vnet_routes(dvs, prefix, vnet_name, endpoint, mac=mac, vni=vni, ep_monitor=ep_monitor)
+
+
+def set_vnet_routes(dvs, prefix, vnet_name, endpoint, mac="", vni=0, ep_monitor=""):
     conf_db = swsscommon.DBConnector(swsscommon.CONFIG_DB, dvs.redis_sock, 0)
 
     attrs = [
@@ -152,11 +157,12 @@ def create_vnet_routes(dvs, prefix, vnet_name, endpoint, mac="", vni=0):
     if mac:
         attrs.append(('mac_address', mac))
 
-    create_entry_tbl(
-        conf_db,
-        "VNET_ROUTE_TUNNEL", '|', "%s|%s" % (vnet_name, prefix),
-        attrs,
-    )
+    if ep_monitor:
+        attrs.append(('endpoint_monitor', ep_monitor))
+
+    tbl = swsscommon.Table(conf_db, "VNET_ROUTE_TUNNEL")
+    fvs = swsscommon.FieldValuePairs(attrs)
+    tbl.set("%s|%s" % (vnet_name, prefix), fvs)
 
     time.sleep(2)
 
@@ -429,7 +435,9 @@ class VnetVxlanVrfTunnel(object):
     ASIC_VRF_TABLE          = "ASIC_STATE:SAI_OBJECT_TYPE_VIRTUAL_ROUTER"
     ASIC_ROUTE_ENTRY        = "ASIC_STATE:SAI_OBJECT_TYPE_ROUTE_ENTRY"
     ASIC_NEXT_HOP           = "ASIC_STATE:SAI_OBJECT_TYPE_NEXT_HOP"
-    ASIC_VLAN_TABLE          = "ASIC_STATE:SAI_OBJECT_TYPE_VLAN"
+    ASIC_VLAN_TABLE         = "ASIC_STATE:SAI_OBJECT_TYPE_VLAN"
+    ASIC_NEXT_HOP_GROUP     = "ASIC_STATE:SAI_OBJECT_TYPE_NEXT_HOP_GROUP"
+    ASIC_NEXT_HOP_GROUP_MEMBER  = "ASIC_STATE:SAI_OBJECT_TYPE_NEXT_HOP_GROUP_MEMBER"
 
     tunnel_map_ids       = set()
     tunnel_map_entry_ids = set()
@@ -440,6 +448,7 @@ class VnetVxlanVrfTunnel(object):
     vnet_vr_ids          = set()
     vr_map               = {}
     nh_ids               = {}
+    nhg_ids              = {}
 
     def fetch_exist_entries(self, dvs):
         self.vnet_vr_ids = get_exist_entries(dvs, self.ASIC_VRF_TABLE)
@@ -450,6 +459,7 @@ def fetch_exist_entries(self, dvs):
         self.rifs = get_exist_entries(dvs, self.ASIC_RIF_TABLE)
         self.routes = get_exist_entries(dvs, self.ASIC_ROUTE_ENTRY)
         self.nhops = get_exist_entries(dvs, self.ASIC_NEXT_HOP)
+        self.nhgs = get_exist_entries(dvs, self.ASIC_NEXT_HOP_GROUP)
 
         global loopback_id, def_vr_id, switch_mac
         if not loopback_id:
@@ -670,7 +680,7 @@ def check_del_vnet_local_routes(self, dvs, name):
         # TODO: Implement for VRF VNET
         return True
 
-    def check_vnet_routes(self, dvs, name, endpoint, tunnel, mac="", vni=0):
+    def check_vnet_routes(self, dvs, name, endpoint, tunnel, mac="", vni=0, route_ids=""):
         asic_db = swsscommon.DBConnector(swsscommon.ASIC_DB, dvs.redis_sock, 0)
 
         vr_ids = self.vnet_route_ids(dvs, name)
@@ -697,7 +707,10 @@ def check_vnet_routes(self, dvs, name, endpoint, tunnel, mac="", vni=0):
             self.nhops.add(new_nh)
 
         check_object(asic_db, self.ASIC_NEXT_HOP, new_nh, expected_attr)
-        new_route = get_created_entries(asic_db, self.ASIC_ROUTE_ENTRY, self.routes, count)
+        if not route_ids:
+            new_route = get_created_entries(asic_db, self.ASIC_ROUTE_ENTRY, self.routes, count)
+        else:
+            new_route = route_ids
 
         #Check if the route is in expected VRF
         asic_vrs = set()
@@ -714,8 +727,107 @@ def check_vnet_routes(self, dvs, name, endpoint, tunnel, mac="", vni=0):
 
         self.routes.update(new_route)
 
-    def check_del_vnet_routes(self, dvs, name):
+        return new_route
+
+    def serialize_endpoint_group(self, endpoints):
+        # Sort a copy: callers index their mac/vni lists by the original endpoint order.
+        return ",".join(sorted(endpoints))
+
+    def check_next_hop_group_member(self, dvs, nhg, expected_endpoint, expected_attrs):
+        expected_endpoint_str = self.serialize_endpoint_group(expected_endpoint)
+        asic_db = swsscommon.DBConnector(swsscommon.ASIC_DB, dvs.redis_sock, 0)
+        tbl_nhgm =  swsscommon.Table(asic_db, self.ASIC_NEXT_HOP_GROUP_MEMBER)
+        tbl_nh =  swsscommon.Table(asic_db, self.ASIC_NEXT_HOP)
+        entries = set(tbl_nhgm.getKeys())
+        endpoints = []
+        for entry in entries:
+            status, fvs = tbl_nhgm.get(entry)
+            fvs = dict(fvs)
+            assert status, "Got an error when get a key"
+            if fvs["SAI_NEXT_HOP_GROUP_MEMBER_ATTR_NEXT_HOP_GROUP_ID"] == nhg:
+                nh_key = fvs["SAI_NEXT_HOP_GROUP_MEMBER_ATTR_NEXT_HOP_ID"]
+                status, nh_fvs = tbl_nh.get(nh_key)
+                nh_fvs = dict(nh_fvs)
+                assert status, "Got an error when get a key"
+                endpoint = nh_fvs["SAI_NEXT_HOP_ATTR_IP"]
+                endpoints.append(endpoint)
+                assert endpoint in expected_attrs
+                check_object(asic_db, self.ASIC_NEXT_HOP, nh_key, expected_attrs[endpoint])
+
+        assert self.serialize_endpoint_group(endpoints) == expected_endpoint_str
+
+    def check_vnet_ecmp_routes(self, dvs, name, endpoints, tunnel, mac=[], vni=[], route_ids=[], nhg=""):
+        asic_db = swsscommon.DBConnector(swsscommon.ASIC_DB, dvs.redis_sock, 0)
+        endpoint_str = name + "|" + self.serialize_endpoint_group(endpoints)
+
+        vr_ids = self.vnet_route_ids(dvs, name)
+        count = len(vr_ids)
+
+        expected_attrs = {}
+        for idx, endpoint in enumerate(endpoints):
+            expected_attr = {
+                        "SAI_NEXT_HOP_ATTR_TYPE": "SAI_NEXT_HOP_TYPE_TUNNEL_ENCAP",
+                        "SAI_NEXT_HOP_ATTR_IP": endpoint,
+                        "SAI_NEXT_HOP_ATTR_TUNNEL_ID": self.tunnel[tunnel],
+                    }
+            if vni and vni[idx]:
+                expected_attr.update({'SAI_NEXT_HOP_ATTR_TUNNEL_VNI': vni[idx]})
+            if mac and mac[idx]:
+                expected_attr.update({'SAI_NEXT_HOP_ATTR_TUNNEL_MAC': mac[idx]})
+            expected_attrs[endpoint] = expected_attr
+
+        if nhg:
+            new_nhg = nhg
+        elif endpoint_str in self.nhg_ids:
+            new_nhg = self.nhg_ids[endpoint_str]
+        else:
+            new_nhg = get_created_entry(asic_db, self.ASIC_NEXT_HOP_GROUP, self.nhgs)
+            self.nhg_ids[endpoint_str] = new_nhg
+            self.nhgs.add(new_nhg)
+
+
+        # Check routes in ingress VRF
+        expected_nhg_attr = {
+                        "SAI_NEXT_HOP_GROUP_ATTR_TYPE": "SAI_NEXT_HOP_GROUP_TYPE_DYNAMIC_UNORDERED_ECMP",
+                    }
+        check_object(asic_db, self.ASIC_NEXT_HOP_GROUP, new_nhg, expected_nhg_attr)
+
+        # Check nexthop group member
+        self.check_next_hop_group_member(dvs, new_nhg, endpoints, expected_attrs)
+
+        if route_ids:
+            new_route = route_ids
+        else:
+            new_route = get_created_entries(asic_db, self.ASIC_ROUTE_ENTRY, self.routes, count)
+
+        #Check if the route is in expected VRF
+        asic_vrs = set()
+        for idx in range(count):
+            check_object(asic_db, self.ASIC_ROUTE_ENTRY, new_route[idx],
+                        {
+                            "SAI_ROUTE_ENTRY_ATTR_NEXT_HOP_ID": new_nhg,
+                        }
+                    )
+            rt_key = json.loads(new_route[idx])
+            asic_vrs.add(rt_key['vr'])
+
+        assert asic_vrs == vr_ids
+
+        self.routes.update(new_route)
+
+        return new_route, new_nhg
+
+    def check_del_vnet_routes(self, dvs, name, prefixes=[]):
         # TODO: Implement for VRF VNET
+
+        def _access_function():
+            route_entries = get_exist_entries(dvs, self.ASIC_ROUTE_ENTRY)
+            route_prefixes = [json.loads(route_entry)["dest"] for route_entry in route_entries]
+            return (all(prefix not in route_prefixes for prefix in prefixes), None)
+
+        if prefixes:
+            wait_for_result(_access_function)
+
         return True
 
 
@@ -790,7 +902,7 @@ def test_vnet_orch_1(self, dvs, testlog):
         vnet_obj.check_del_vnet_routes(dvs, 'Vnet_2001')
 
         delete_vnet_routes(dvs, "100.100.1.1/32", 'Vnet_2000')
-        vnet_obj.check_del_vnet_routes(dvs, 'Vnet_2001')
+        vnet_obj.check_del_vnet_routes(dvs, 'Vnet_2000')
 
         delete_phy_interface(dvs, "Ethernet4", "100.102.1.1/24")
         vnet_obj.check_del_router_interface(dvs, "Ethernet4")
@@ -1125,6 +1237,118 @@ def test_vnet_vxlan_multi_map(self, dvs, testlog):
 
         create_vxlan_tunnel_map(dvs, tunnel_name, 'map_1', 'Vlan1000', '1000')
 
+    '''
+    Test 7 - Test for vnet tunnel routes with ECMP nexthop group
+    '''
+    def test_vnet_orch_7(self, dvs, testlog):
+        """Test 7: IPv4 VNET tunnel routes that resolve to an ECMP nexthop group."""
+
+        vnet = 'Vnet7'
+        vni = '10007'
+        tunnel_name = 'tunnel_7'
+        tunnel_ip = '7.7.7.7'
+        first_prefix = "100.100.1.1/32"
+        second_prefix = "100.100.2.1/32"
+        three_endpoints = '7.0.0.1,7.0.0.2,7.0.0.3'
+        four_endpoints = '7.0.0.1,7.0.0.2,7.0.0.3,7.0.0.4'
+
+        vnet_obj = self.get_vnet_obj()
+        vnet_obj.fetch_exist_entries(dvs)
+
+        # Bring up the VXLAN tunnel and the VNET on top of it, then verify both
+        create_vxlan_tunnel(dvs, tunnel_name, tunnel_ip)
+        create_vnet_entry(dvs, vnet, tunnel_name, vni, "")
+        vnet_obj.check_vnet_entry(dvs, vnet)
+        vnet_obj.check_vxlan_tunnel_entry(dvs, tunnel_name, vnet, vni)
+        vnet_obj.check_vxlan_tunnel(dvs, tunnel_name, tunnel_ip)
+
+        # Install a tunnel route spread over three endpoints
+        vnet_obj.fetch_exist_entries(dvs)
+        create_vnet_routes(dvs, first_prefix, vnet, three_endpoints)
+        route_ids, nhg_three = vnet_obj.check_vnet_ecmp_routes(dvs, vnet, three_endpoints.split(','), tunnel_name)
+
+        # Re-point the same route at four endpoints; the three-way group must no longer be listed
+        set_vnet_routes(dvs, first_prefix, vnet, four_endpoints)
+        route_ids, nhg_four = vnet_obj.check_vnet_ecmp_routes(dvs, vnet, four_endpoints.split(','), tunnel_name, route_ids=route_ids)
+        vnet_obj.fetch_exist_entries(dvs)
+        assert nhg_three not in vnet_obj.nhgs
+
+        # A second route to the same endpoint set must reuse the existing group
+        create_vnet_routes(dvs, second_prefix, vnet, four_endpoints)
+        _, nhg_shared = vnet_obj.check_vnet_ecmp_routes(dvs, vnet, four_endpoints.split(','), tunnel_name)
+        assert nhg_shared == nhg_four
+
+        # Dropping the first route leaves the group in place for the second one
+        delete_vnet_routes(dvs, first_prefix, vnet)
+        vnet_obj.check_del_vnet_routes(dvs, vnet, [first_prefix])
+        vnet_obj.fetch_exist_entries(dvs)
+        assert nhg_four in vnet_obj.nhgs
+
+        # Dropping the last route must take the group away as well
+        delete_vnet_routes(dvs, second_prefix, vnet)
+        vnet_obj.check_del_vnet_routes(dvs, vnet, [second_prefix])
+        vnet_obj.fetch_exist_entries(dvs)
+        assert nhg_shared not in vnet_obj.nhgs
+
+        delete_vnet_entry(dvs, vnet)
+        vnet_obj.check_del_vnet_entry(dvs, vnet)
+
+    '''
+    Test 8 - Test for ipv6 vnet tunnel routes with ECMP nexthop group
+    '''
+    def test_vnet_orch_8(self, dvs, testlog):
+        """Test 8: IPv6 VNET tunnel routes that resolve to an ECMP nexthop group."""
+
+        vnet = 'Vnet8'
+        vni = '10008'
+        tunnel_name = 'tunnel_8'
+        tunnel_ip = 'fd:8::32'
+        first_prefix = "fd:8:10::32/128"
+        second_prefix = "fd:8:20::32/128"
+        three_endpoints = 'fd:8:1::1,fd:8:1::2,fd:8:1::3'
+        four_endpoints = 'fd:8:1::1,fd:8:1::2,fd:8:1::3,fd:8:1::4'
+
+        vnet_obj = self.get_vnet_obj()
+        vnet_obj.fetch_exist_entries(dvs)
+
+        # Bring up the VXLAN tunnel and the VNET on top of it, then verify both
+        create_vxlan_tunnel(dvs, tunnel_name, tunnel_ip)
+        create_vnet_entry(dvs, vnet, tunnel_name, vni, "")
+        vnet_obj.check_vnet_entry(dvs, vnet)
+        vnet_obj.check_vxlan_tunnel_entry(dvs, tunnel_name, vnet, vni)
+        vnet_obj.check_vxlan_tunnel(dvs, tunnel_name, tunnel_ip)
+
+        # Install a tunnel route spread over three endpoints
+        vnet_obj.fetch_exist_entries(dvs)
+        create_vnet_routes(dvs, first_prefix, vnet, three_endpoints)
+        route_ids, nhg_three = vnet_obj.check_vnet_ecmp_routes(dvs, vnet, three_endpoints.split(','), tunnel_name)
+
+        # Re-point the same route at four endpoints; the three-way group must no longer be listed
+        set_vnet_routes(dvs, first_prefix, vnet, four_endpoints)
+        route_ids, nhg_four = vnet_obj.check_vnet_ecmp_routes(dvs, vnet, four_endpoints.split(','), tunnel_name, route_ids=route_ids)
+        vnet_obj.fetch_exist_entries(dvs)
+        assert nhg_three not in vnet_obj.nhgs
+
+        # A second route to the same endpoint set must reuse the existing group
+        create_vnet_routes(dvs, second_prefix, vnet, four_endpoints)
+        _, nhg_shared = vnet_obj.check_vnet_ecmp_routes(dvs, vnet, four_endpoints.split(','), tunnel_name)
+        assert nhg_shared == nhg_four
+
+        # Dropping the first route leaves the group in place for the second one
+        delete_vnet_routes(dvs, first_prefix, vnet)
+        vnet_obj.check_del_vnet_routes(dvs, vnet, [first_prefix])
+        vnet_obj.fetch_exist_entries(dvs)
+        assert nhg_four in vnet_obj.nhgs
+
+        # Dropping the last route must take the group away as well
+        delete_vnet_routes(dvs, second_prefix, vnet)
+        vnet_obj.check_del_vnet_routes(dvs, vnet, [second_prefix])
+        vnet_obj.fetch_exist_entries(dvs)
+        assert nhg_shared not in vnet_obj.nhgs
+
+        delete_vnet_entry(dvs, vnet)
+        vnet_obj.check_del_vnet_entry(dvs, vnet)
+
 
 # Add Dummy always-pass test at end as workaroud
 # for issue when Flaky fail on final test it invokes module tear-down before retrying