Skip to content

Commit

Permalink
Added changes to handle dependency check in FdbSyncd and FpmSyncd for…
Browse files Browse the repository at this point in the history
… warm-boot (sonic-net#1556)

Added changes to handle dependency checks in FpmSyncd and FdbSyncd during warm-reboot.
This ensures that, for EVPN warm-reboot, the order in which data is replayed to the kernel is maintained across the various submodules, so that kernel programming succeeds.
  • Loading branch information
nkelapur authored Mar 3, 2021
1 parent 1d33881 commit 721f47d
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 10 deletions.
39 changes: 37 additions & 2 deletions fdbsyncd/fdbsync.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,36 @@ FdbSync::~FdbSync()
}
}


// Check if interface entries are restored in kernel
bool FdbSync::isIntfRestoreDone()
{
vector<string> required_modules = {
"vxlanmgrd",
"intfmgrd",
"vlanmgrd",
"vrfmgrd"
};

for (string& module : required_modules)
{
WarmStart::WarmStartState state;

WarmStart::getWarmStartState(module, state);
if (state == WarmStart::REPLAYED || state == WarmStart::RECONCILED)
{
SWSS_LOG_INFO("Module %s Replayed or Reconciled %d",module.c_str(), (int) state);
}
else
{
SWSS_LOG_INFO("Module %s NOT Replayed or Reconciled %d",module.c_str(), (int) state);
return false;
}
}

return true;
}

void FdbSync::processCfgEvpnNvo()
{
std::deque<KeyOpFieldsValuesTuple> entries;
Expand Down Expand Up @@ -447,14 +477,17 @@ void FdbSync::macDelVxlanDB(string key)
fvVector.push_back(t);
fvVector.push_back(v);

SWSS_LOG_NOTICE("%sVXLAN_FDB_TABLE: DEL_KEY %s vtep:%s type:%s",
m_AppRestartAssist->isWarmStartInProgress() ? "WARM-RESTART:" : "" ,
key.c_str(), vtep.c_str(), type.c_str());

// If warmstart is in progress, we take all netlink changes into the cache map
if (m_AppRestartAssist->isWarmStartInProgress())
{
m_AppRestartAssist->insertToMap(APP_VXLAN_FDB_TABLE_NAME, key, fvVector, true);
return;
}

SWSS_LOG_INFO("VXLAN_FDB_TABLE: DEL_KEY %s vtep:%s type:%s", key.c_str(), vtep.c_str(), type.c_str());
m_fdbTable.del(key);
return;

Expand All @@ -476,14 +509,16 @@ void FdbSync::macAddVxlan(string key, struct in_addr vtep, string type, uint32_t
fvVector.push_back(t);
fvVector.push_back(v);

SWSS_LOG_INFO("%sVXLAN_FDB_TABLE: ADD_KEY %s vtep:%s type:%s",
m_AppRestartAssist->isWarmStartInProgress() ? "WARM-RESTART:" : "" ,
key.c_str(), svtep.c_str(), type.c_str());
// If warmstart is in progress, we take all netlink changes into the cache map
if (m_AppRestartAssist->isWarmStartInProgress())
{
m_AppRestartAssist->insertToMap(APP_VXLAN_FDB_TABLE_NAME, key, fvVector, false);
return;
}

SWSS_LOG_INFO("VXLAN_FDB_TABLE: ADD_KEY %s vtep:%s type:%s", key.c_str(), svtep.c_str(), type.c_str());
m_fdbTable.set(key, fvVector);

return;
Expand Down
15 changes: 12 additions & 3 deletions fdbsyncd/fdbsync.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,17 @@
#include "netmsg.h"
#include "warmRestartAssist.h"

// The timeout value (in seconds) for fdbsyncd reconcilation logic
#define DEFAULT_FDBSYNC_WARMSTART_TIMER 30
/*
* Default timer interval (in seconds) for fdbsyncd reconciliation
*/
#define DEFAULT_FDBSYNC_WARMSTART_TIMER 120

/*
* This is the MAX time in seconds, fdbsyncd will wait after warm-reboot
* for the interface entries to be recreated in kernel before attempting to
* write the FDB data to kernel
*/
#define INTF_RESTORE_MAX_WAIT_TIME 180

namespace swss {

Expand Down Expand Up @@ -43,7 +52,7 @@ class FdbSync : public NetMsg

virtual void onMsg(int nlmsg_type, struct nl_object *obj);

bool isFdbRestoreDone();
bool isIntfRestoreDone();

AppRestartAssist *getRestartAssist()
{
Expand Down
57 changes: 54 additions & 3 deletions fdbsyncd/fdbsyncd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "netdispatcher.h"
#include "netlink.h"
#include "fdbsyncd/fdbsync.h"
#include "warm_restart.h"

using namespace std;
using namespace swss;
Expand Down Expand Up @@ -35,6 +36,7 @@ int main(int argc, char **argv)
Selectable *temps;
int ret;
Select s;
SelectableTimer replayCheckTimer(timespec{0, 0});

using namespace std::chrono;

Expand All @@ -45,7 +47,29 @@ int main(int argc, char **argv)
if (sync.getRestartAssist()->isWarmStartInProgress())
{
sync.getRestartAssist()->readTablesToMap();
SWSS_LOG_NOTICE("Starting ReconcileTimer");

steady_clock::time_point starttime = steady_clock::now();
while (!sync.isIntfRestoreDone())
{
duration<double> time_span =
duration_cast<duration<double>>(steady_clock::now() - starttime);
int pasttime = int(time_span.count());

if (pasttime > INTF_RESTORE_MAX_WAIT_TIME)
{
SWSS_LOG_INFO("timed-out before all interface data was replayed to kernel!!!");
throw runtime_error("fdbsyncd: timedout on interface data replay");
}
sleep(1);
}
replayCheckTimer.setInterval(timespec{1, 0});
replayCheckTimer.start();
s.addSelectable(&replayCheckTimer);
}
else
{
sync.getRestartAssist()->warmStartDisabled();
sync.m_reconcileDone = true;
}

netlink.registerGroup(RTNLGRP_LINK);
Expand All @@ -67,14 +91,41 @@ int main(int argc, char **argv)
{
s.select(&temps);

if(temps == (Selectable *)sync.getFdbStateTable())
if (temps == (Selectable *)sync.getFdbStateTable())
{
sync.processStateFdb();
}
else if (temps == (Selectable *)sync.getCfgEvpnNvoTable())
{
sync.processCfgEvpnNvo();
}
else if (temps == &replayCheckTimer)
{
if (sync.getFdbStateTable()->empty() && sync.getCfgEvpnNvoTable()->empty())
{
sync.getRestartAssist()->appDataReplayed();
SWSS_LOG_NOTICE("FDB Replay Complete");
s.removeSelectable(&replayCheckTimer);

/* Obtain warm-restart timer defined for routing application */
uint32_t warmRestartIval = WarmStart::getWarmStartTimer("bgp","bgp");
if (warmRestartIval)
{
sync.getRestartAssist()->setReconcileInterval(warmRestartIval);
}
//Else the interval is already set to default value

//TODO: Optimise the reconcillation time using eoiu - issue#1657
SWSS_LOG_NOTICE("Starting ReconcileTimer");
sync.getRestartAssist()->startReconcileTimer(s);
}
else
{
replayCheckTimer.setInterval(timespec{1, 0});
// re-start replay check timer
replayCheckTimer.start();
}
}
else
{
/*
Expand All @@ -88,7 +139,7 @@ int main(int argc, char **argv)
sync.m_reconcileDone = true;
sync.getRestartAssist()->stopReconcileTimer(s);
sync.getRestartAssist()->reconcile();
SWSS_LOG_NOTICE("VXLAN FDB VNI Reconcillation Complete (Timer)");
SWSS_LOG_NOTICE("VXLAN FDB VNI Reconcillation Complete");
}
}
}
Expand Down
7 changes: 7 additions & 0 deletions fpmsyncd/fpmsyncd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ using namespace swss;
*/
const uint32_t DEFAULT_ROUTING_RESTART_INTERVAL = 120;


// Wait 3 seconds after detecting EOIU reached state
// TODO: support eoiu hold interval config
const uint32_t DEFAULT_EOIU_HOLD_INTERVAL = 3;
Expand Down Expand Up @@ -67,6 +68,7 @@ int main(int argc, char **argv)
SelectableTimer eoiuCheckTimer(timespec{0, 0});
// After eoiu flags are detected, start a hold timer before starting reconciliation.
SelectableTimer eoiuHoldTimer(timespec{0, 0});

/*
* Pipeline should be flushed right away to deal with state pending
* from previous try/catch iterations.
Expand Down Expand Up @@ -108,6 +110,10 @@ int main(int argc, char **argv)
s.addSelectable(&eoiuCheckTimer);
SWSS_LOG_NOTICE("Warm-Restart eoiuCheckTimer timer started.");
}
else
{
sync.m_warmStartHelper.setState(WarmStart::WSDISABLED);
}

while (true)
{
Expand All @@ -132,6 +138,7 @@ int main(int argc, char **argv)
{
SWSS_LOG_NOTICE("Warm-Restart EOIU hold timer expired.");
}

if (sync.m_warmStartHelper.inProgress())
{
sync.m_warmStartHelper.reconcile();
Expand Down
4 changes: 2 additions & 2 deletions tests/test_warm_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def swss_app_check_RestoreCount_single(state_db, restore_count, name):
if fv[0] == "restore_count":
assert int(fv[1]) == restore_count[key] + 1
elif fv[0] == "state":
assert fv[1] == "reconciled" or fv[1] == "disabled"
assert fv[1] == "reconciled" or fv[1] == "disabled"

def swss_app_check_warmstart_state(state_db, name, state):
warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME)
Expand Down Expand Up @@ -1150,7 +1150,7 @@ def test_routing_WarmRestart(self, dvs, testlog):
time.sleep(5)

# Verify FSM
swss_app_check_warmstart_state(state_db, "bgp", "")
swss_app_check_warmstart_state(state_db, "bgp", "disabled")

# Verify that multiple changes are seen in swss and sairedis logs as there's
# no warm-reboot logic in place.
Expand Down
17 changes: 17 additions & 0 deletions warmrestart/warmRestartAssist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ AppRestartAssist::cache_state_t AppRestartAssist::getCacheEntryState(const std::
throw std::logic_error("cache entry state is invalid");
}

// Publish the REPLAYED warm-start state for this application (m_appName).
void AppRestartAssist::appDataReplayed()
{
    const auto replayedState = WarmStart::REPLAYED;
    WarmStart::setWarmStartState(m_appName, replayedState);
}

// Publish the WSDISABLED warm-start state for this application (m_appName).
void AppRestartAssist::warmStartDisabled()
{
    const auto disabledState = WarmStart::WSDISABLED;
    WarmStart::setWarmStartState(m_appName, disabledState);
}

// Read table(s) from APPDB and append stale flag then insert to cachemap
void AppRestartAssist::readTablesToMap()
{
Expand Down Expand Up @@ -274,6 +284,13 @@ void AppRestartAssist::reconcile()
return;
}

// Override the reconcile interval: remember the new value in
// m_reconcileTimer and re-arm m_warmStartTimer's interval with it.
// The value is used as the tv_sec field of the timespec (nanoseconds = 0).
void AppRestartAssist::setReconcileInterval(uint32_t time)
{
    m_reconcileTimer = time;

    const timespec reconcileInterval{m_reconcileTimer, 0};
    m_warmStartTimer.setInterval(reconcileInterval);
}

// start the timer, take Select class "s" to add the timer.
void AppRestartAssist::startReconcileTimer(Select &s)
{
Expand Down
3 changes: 3 additions & 0 deletions warmrestart/warmRestartAssist.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,13 @@ class AppRestartAssist
DELETE = 3
};
// These functions were used as described in the class description
void setReconcileInterval(uint32_t time);
void startReconcileTimer(Select &s);
void stopReconcileTimer(Select &s);
bool checkReconcileTimer(Selectable *s);
void readTablesToMap(void);
void appDataReplayed(void);
void warmStartDisabled(void);
void insertToMap(std::string tableName, std::string key, std::vector<FieldValueTuple> fvVector, bool delete_key);
void reconcile(void);
bool isWarmStartInProgress(void)
Expand Down

0 comments on commit 721f47d

Please sign in to comment.