Skip to content

Commit

Permalink
Merge pull request #4506 from cyclinder/0.9_cp/coordinator_detect_tim…
Browse files Browse the repository at this point in the history
…eout

Detect IP conflicts before gateway detection to fix communication fail
  • Loading branch information
weizhoublue authored Jan 3, 2025
2 parents 56bce06 + 90acb22 commit 5014593
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 39 deletions.
4 changes: 2 additions & 2 deletions cmd/coordinator/cmd/cni_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,11 +285,11 @@ func ValidateDelectOptions(config *DetectOptions) (*DetectOptions, error) {
}

if config.Interval == "" {
config.Interval = "1s"
config.Interval = "10ms"
}

if config.TimeOut == "" {
config.TimeOut = "3s"
config.TimeOut = "100ms"
}

_, err := time.ParseDuration(config.Interval)
Expand Down
88 changes: 58 additions & 30 deletions cmd/coordinator/cmd/command_add.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,58 @@ func CmdAdd(args *skel.CmdArgs) (err error) {

logger.Sugar().Infof("Get coordinator config: %+v", c)

errg := errgroup.Group{}
// we do detect gateway connection firstly
errgConflict := errgroup.Group{}
// IP conflict detection must precede gateway detection, which avoids the
// possibility that gateway detection may update arp table entries first and cause
// communication problems when IP conflict detection fails
// see https://github.com/spidernet-io/spiderpool/issues/4475
var ipc *ipchecking.IPChecker
if conf.IPConflict != nil && *conf.IPConflict {
logger.Debug("Try to detect ip conflict")
ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger)
if err != nil {
return fmt.Errorf("failed to run NewIPChecker: %w", err)
}
ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errgConflict)
} else {
logger.Debug("disable detect ip conflict")
}

if err = errgConflict.Wait(); err != nil {
logger.Error("failed to detect ip conflict", zap.Error(err))
if errors.Is(err, constant.ErrIPConflict) {
logger.Info("ip conflict detected, clean up conflict IPs")
_, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs(
&models.IpamBatchDelArgs{
ContainerID: &args.ContainerID,
NetNamespace: args.Netns,
PodName: (*string)(&k8sArgs.K8S_POD_NAME),
PodNamespace: (*string)(&k8sArgs.K8S_POD_NAMESPACE),
PodUID: (*string)(&k8sArgs.K8S_POD_UID),
},
))
if innerErr != nil {
logger.Sugar().Errorf("failed to clean up conflict IPs, error: %v", innerErr)
return multierr.Append(err, innerErr)
}
}
return err
}

// Fixed Mac addresses must come after IP conflict detection, otherwise the switch learns to communicate
// with the wrong Mac address when IP conflict detection fails
if len(conf.MacPrefix) != 0 {
hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName)
if err != nil {
return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err)
}
logger.Info("Fix mac address successfully", zap.String("interface", args.IfName), zap.String("macAddress", hwAddr))
}

// we do detect gateway connection lastly
// Finally, there is gateway detection, which updates the correct arp table entries
// once there are no IP address conflicts and fixed Mac addresses
errgGateway := errgroup.Group{}
if conf.DetectGateway != nil && *conf.DetectGateway {
logger.Debug("Try to detect gateway")

Expand All @@ -202,7 +252,6 @@ func CmdAdd(args *skel.CmdArgs) (err error) {
return fmt.Errorf("failed to GetDefaultGatewayByName: %v", err)
}
logger.Debug("Get GetDefaultGatewayByName", zap.Any("Gws", gws))

p, err := gwconnection.New(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.currentInterface, logger)
if err != nil {
return fmt.Errorf("failed to init the gateway client: %v", err)
Expand All @@ -211,10 +260,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) {
for _, gw := range gws {
if gw.To4() != nil {
p.V4Gw = gw
errg.Go(c.hostNs, c.netns, p.ArpingOverIface)
errgGateway.Go(c.hostNs, c.netns, p.ArpingOverIface)
} else {
p.V6Gw = gw
errg.Go(c.hostNs, c.netns, p.NDPingOverIface)
errgGateway.Go(c.hostNs, c.netns, p.NDPingOverIface)
}
}
return nil
Expand All @@ -226,21 +275,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) {
logger.Debug("disable detect gateway")
}

var ipc *ipchecking.IPChecker
if conf.IPConflict != nil && *conf.IPConflict {
logger.Debug("Try to detect ip conflict")
ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger)
if err != nil {
return fmt.Errorf("failed to run NewIPChecker: %w", err)
}
ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errg)
} else {
logger.Debug("disable detect ip conflict")
}

if err = errg.Wait(); err != nil {
logger.Error("failed to detect gateway and ip checking", zap.Error(err))
if errors.Is(err, constant.ErrIPConflict) || errors.Is(err, constant.ErrGatewayUnreachable) {
if err = errgGateway.Wait(); err != nil {
logger.Error("failed to detect gateway reachable", zap.Error(err))
if errors.Is(err, constant.ErrGatewayUnreachable) {
logger.Info("gateway unreachable detected, clean up conflict IPs")
_, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs(
&models.IpamBatchDelArgs{
ContainerID: &args.ContainerID,
Expand All @@ -255,19 +293,9 @@ func CmdAdd(args *skel.CmdArgs) (err error) {
return multierr.Append(err, innerErr)
}
}

return err
}

// overwrite mac address
if len(conf.MacPrefix) != 0 {
hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName)
if err != nil {
return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err)
}
logger.Info("Override hardware address successfully", zap.String("interface", args.IfName), zap.String("hardware address", hwAddr))
}

// set txqueuelen
if conf.TxQueueLen != nil && *conf.TxQueueLen > 0 {
if err = networking.LinkSetTxqueueLen(args.IfName, int(*conf.TxQueueLen)); err != nil {
Expand Down
4 changes: 2 additions & 2 deletions docs/concepts/coordinator-zh_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ EOF
| podRPFilter | 设置 Pod 的 sysctl 参数 rp_filter | 整数型 | optional | 0 |
| hostRPFilter | (遗弃)设置节点 的 sysctl 参数 rp_filter | 整数型 | optional | 0 |
| txQueueLen | 设置 Pod 的网卡传输队列 | 整数型 | optional | 0 |
| detectOptions | 检测地址冲突和网关可达性的高级配置项: 包括重试次数(默认为 3 次), 探测间隔(默认为 1s) 和 超时时间(默认为 1s) | 对象类型 | optional ||
| detectOptions | 检测地址冲突和网关可达性的高级配置项: 包括重试次数(默认为 3 次), 探测间隔(默认为 10ms) 和 超时时间(默认为 100ms) | 对象类型 | optional ||
| logOptions | 日志配置,包括 logLevel(默认为 debug) 和 logFile(默认为 /var/log/spidernet/coordinator.log) | 对象类型 | optional | - |

> 如果您通过 `SpinderMultusConfig CR` 帮助创建 NetworkAttachmentDefinition CR,您可以在 `SpinderMultusConfig` 中配置 `coordinator` (所有字段)。参考: [SpinderMultusConfig](../reference/crd-spidermultusconfig.md)
Expand Down Expand Up @@ -171,7 +171,7 @@ spec:
vethLinkAddress: "169.254.200.1"
```

> `vethLinkAddress` 默认为空,表示不配置。不为空则必须是一个合法的本地链路地址。
> `vethLinkAddress` 默认为空,表示不配置。不为空则必须是一个合法的本地链路地址。

## 自动获取集群 Service 的 CIDR

Expand Down
9 changes: 4 additions & 5 deletions docs/concepts/coordinator.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Let's delve into how coordinator implements these features.
| podRPFilter | Set the rp_filter sysctl parameter on the pod, which is recommended to be set to 0 | int | optional | 0 |
| hostRPFilter | (deprecated)Set the rp_filter sysctl parameter on the node, which is recommended to be set to 0 | int | optional | 0 |
| txQueueLen | set txqueuelen(Transmit Queue Length) of the pod's interface | int | optional | 0 |
| detectOptions | The advanced configuration of detectGateway and detectIPConflict, including retry numbers(default is 3), interval(default is 1s) and timeout(default is 1s) | obejct | optional | nil |
| detectOptions | The advanced configuration of detectGateway and detectIPConflict, including retry numbers(default is 3), interval(default is 10ms) and timeout(default is 100ms) | obejct | optional | nil |
| logOptions | The configuration of logging, including logLevel(default is debug) and logFile(default is /var/log/spidernet/coordinator.log) | obejct | optional | nil |

> You can configure `coordinator` by specifying all the relevant fields in `SpinderMultusConfig` if a NetworkAttachmentDefinition CR is created via `SpinderMultusConfig CR`. For more information, please refer to [SpinderMultusConfig](../reference/crd-spidermultusconfig.md).
Expand Down Expand Up @@ -101,17 +101,16 @@ spec:

> Note: There are some switches that are not allowed to be probed by arp, otherwise an alarm will be issued, in this case, we need to set detectGateway to false


## Fix MAC address prefix for Pods(alpha)

Some traditional applications may require a fixed MAC address or IP address to couple the behavior of the application. For example, the License Server may need to apply a fixed Mac address
or IP address to issue a license for the app. If the MAC address of a pod changes, the issued license may be invalid. Therefore, you need to fix the MAC address of the pod. Spiderpool can fix
Some traditional applications may require a fixed MAC address or IP address to couple the behavior of the application. For example, the License Server may need to apply a fixed Mac address
or IP address to issue a license for the app. If the MAC address of a pod changes, the issued license may be invalid. Therefore, you need to fix the MAC address of the pod. Spiderpool can fix
the MAC address of the application through `coordinator`, and the fixed rule is to configure the MAC address prefix (2 bytes) + convert the IP of the pod (4 bytes).

Note:

> currently supports updating Macvlan and SR-IOV as pods for CNI. In IPVlan L2 mode, the MAC addresses of the primary interface and the sub-interface are the same and cannot be modified.
>
>
> The fixed rule is to configure the MAC address prefix (2 bytes) + the IP of the converted pod (4 bytes). An IPv4 address is 4 bytes long and can be fully converted to 2 hexadecimal numbers. For IPv6 addresses, only the last 4 bytes are taken.

We can configure it via Spidermultusconfig:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,10 @@ var _ = Describe("MacvlanOverlayOne", Label("overlay", "one-nic", "coordinator")
Expect(frame.CreateSpiderMultusInstance(nad)).NotTo(HaveOccurred())

DeferCleanup(func() {
if CurrentSpecReport().Failed() {
GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped")
return
}
GinkgoWriter.Printf("delete spiderMultusConfig %v/%v. \n", namespace, multusNadName)
Expect(frame.DeleteSpiderMultusInstance(namespace, multusNadName)).NotTo(HaveOccurred())

Expand Down Expand Up @@ -619,6 +623,10 @@ var _ = Describe("MacvlanOverlayOne", Label("overlay", "one-nic", "coordinator")
Expect(frame.CreateSpiderMultusInstance(nad)).NotTo(HaveOccurred())

DeferCleanup(func() {
if CurrentSpecReport().Failed() {
GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped")
return
}
GinkgoWriter.Printf("delete spiderMultusConfig %v/%v. \n", namespace, multusNadName)
Expect(frame.DeleteSpiderMultusInstance(namespace, multusNadName)).NotTo(HaveOccurred())

Expand Down

0 comments on commit 5014593

Please sign in to comment.