Skip to content
This repository has been archived by the owner on Oct 24, 2023. It is now read-only.

fix: Re-ordering HNS policy removal due to 10c change in behavior and consolidating logic in Windows cleanup scripts #4002

Merged
merged 3 commits into from
Nov 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pipelines/pr-windows-signed-scripts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ pr: none

jobs:
- job: test_staged_windows_provisioning_scripts
timeoutInMinutes: 150
pool:
name: $(BUILD_POOL)
steps:
Expand Down
67 changes: 67 additions & 0 deletions staging/provisioning/windows/cleanupnetwork.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# cleanupnetwork.ps1 — consolidated Windows node network cleanup.
# Tears down containers, persisted HNS policy lists, the HNS network, and
# stale Azure CNI state so callers (kubeletstart.ps1 / windowsnodereset.ps1)
# can recreate a clean network.
# NOTE(review): assumes $global:HNSModule is set by the calling script before
# this script is dot-sourced/invoked — it is not defined here; confirm.
$Global:ClusterConfiguration = ConvertFrom-Json ((Get-Content "c:\k\kubeclusterconfig.json" -ErrorAction Stop) | out-string)

$global:NetworkMode = "L2Bridge"
$global:ContainerRuntime = $Global:ClusterConfiguration.Cri.Name
$global:NetworkPlugin = $Global:ClusterConfiguration.Cni.Name

ipmo $global:HNSModule

# Kubenet networks are named after the network mode; Azure CNI always uses "azure".
$networkname = $global:NetworkMode.ToLower()
if ($global:NetworkPlugin -eq "azure") {
    $networkname = "azure"
}

$hnsNetwork = Get-HnsNetwork | ? Name -EQ $networkname
if ($hnsNetwork) {
    # Cleanup all containers
    Write-Host "Cleaning up containers"
    if ($global:ContainerRuntime -eq "containerd") {
        # Kill running tasks first, then remove the containers.
        # (Use ctr.exe consistently — the original mixed ctr.exe and bare ctr.)
        ctr.exe -n k8s.io c ls -q | ForEach-Object { ctr.exe -n k8s.io tasks kill $_ }
        ctr.exe -n k8s.io c ls -q | ForEach-Object { ctr.exe -n k8s.io c rm $_ }
    }
    else {
        docker.exe ps -q | ForEach-Object { docker.exe rm $_ -f }
    }

    Write-Host "Cleaning up persisted HNS policy lists"
    # Initially a workaround for https://github.com/kubernetes/kubernetes/pull/68923 in < 1.14,
    # and https://github.com/kubernetes/kubernetes/pull/78612 for <= 1.15
    #
    # October patch 10.0.17763.1554 introduced a breaking change
    # which requires the hns policy list to be removed before the network if it gets into a bad state
    # See https://github.com/Azure/aks-engine/pull/3956#issuecomment-720797433 for more info
    # Kubeproxy doesn't fail because errors are not handled:
    # https://github.com/delulu/kubernetes/blob/524de768bb64b7adff76792ca3bf0f0ece1e849f/pkg/proxy/winkernel/proxier.go#L532
    Get-HnsPolicyList | Remove-HnsPolicyList

    Write-Host "Cleaning up old HNS network found"
    Remove-HnsNetwork $hnsNetwork
    # Give HNS time to finish tearing the network down before a caller recreates it.
    Start-Sleep 10
}


if ($global:NetworkPlugin -eq "azure") {
    # Fixed log message: the original said "starting kubelet." (copy-paste from
    # kubeletstart.ps1) but this script only performs cleanup.
    Write-Host "NetworkPlugin azure, cleaning up CNI state."

    Write-Host "Cleaning stale CNI data"
    # Kill all cni instances & stale data left by cni
    # Cleanup all files related to cni
    taskkill /IM azure-vnet.exe /f
    taskkill /IM azure-vnet-ipam.exe /f

    # Explicit comma after every element (the original omitted two, silently
    # relying on newline separation).
    $filesToRemove = @(
        "c:\k\azure-vnet.json",
        "c:\k\azure-vnet.json.lock",
        "c:\k\azure-vnet-ipam.json",
        "c:\k\azure-vnet-ipam.json.lock",
        "c:\k\azure-vnet-ipamv6.json",
        "c:\k\azure-vnet-ipamv6.json.lock"
    )

    foreach ($file in $filesToRemove) {
        if (Test-Path $file) {
            Write-Host "Deleting stale file at $file"
            Remove-Item $file
        }
    }
}
142 changes: 19 additions & 123 deletions staging/provisioning/windows/kubeletstart.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ $global:CNIPath = [Io.path]::Combine("$global:KubeDir", "cni")
$global:CNIConfig = [Io.path]::Combine($global:CNIPath, "config", "$global:NetworkMode.conf")
$global:CNIConfigPath = [Io.path]::Combine("$global:CNIPath", "config")


$UseContainerD = ($global:ContainerRuntime -eq "containerd")

$KubeNetwork = "azure"
ipmo $global:HNSModule

#TODO ksbrmnn refactor to be sensical instead of if if if ...

Expand Down Expand Up @@ -56,7 +53,7 @@ else {
}

# Update args to use ContainerD if needed
if ($UseContainerD -eq $true) {
if ($global:ContainerRuntime -eq "containerd") {
$KubeletArgList += @("--container-runtime=remote", "--container-runtime-endpoint=npipe://./pipe/containerd-containerd")
}

Expand Down Expand Up @@ -184,65 +181,28 @@ Update-CNIConfigKubenetContainerD($podCIDR, $masterSubnetGW) {
Add-Content -Path $global:CNIConfig -Value (ConvertTo-Json $configJson -Depth 20)
}

# Required to clean up the HNS policy lists properly
Write-Host "Stopping kubeproxy service"
Stop-Service kubeproxy

if ($global:NetworkPlugin -eq "azure") {
Write-Host "NetworkPlugin azure, starting kubelet."

Write-Host "Cleaning stale CNI data"
# Kill all cni instances & stale data left by cni
# Cleanup all files related to cni
taskkill /IM azure-vnet.exe /f
taskkill /IM azure-vnet-ipam.exe /f
$cnijson = [io.path]::Combine("$KubeDir", "azure-vnet-ipam.json")
if ((Test-Path $cnijson)) {
Remove-Item $cnijson
}
$cnilock = [io.path]::Combine("$KubeDir", "azure-vnet-ipam.json.lock")
if ((Test-Path $cnilock)) {
Remove-Item $cnilock
}
$cnijson = [io.path]::Combine("$KubeDir", "azure-vnet-ipamv6.json")
if ((Test-Path $cnijson)) {
Remove-Item $cnijson
}
$cnilock = [io.path]::Combine("$KubeDir", "azure-vnet-ipamv6.json.lock")
if ((Test-Path $cnilock)) {
Remove-Item $cnilock
}
$cnijson = [io.path]::Combine("$KubeDir", "azure-vnet.json")
if ((Test-Path $cnijson)) {
Remove-Item $cnijson
}
$cnilock = [io.path]::Combine("$KubeDir", "azure-vnet.json.lock")
if ((Test-Path $cnilock)) {
Remove-Item $cnilock
}

# startup the service

# Find if network created by CNI exists, if yes, remove it
# This is required to keep the network non-persistent behavior
# Going forward, this would be done by HNS automatically during restart of the node

$hnsNetwork = Get-HnsNetwork | ? Name -EQ $KubeNetwork
if ($hnsNetwork) {
# Cleanup all containers
docker ps -q | foreach { docker rm $_ -f }

Write-Host "Cleaning up old HNS network found"
Remove-HnsNetwork $hnsNetwork
}

./cleanupnetwork.ps1

# Restart Kubeproxy, which would wait, until the network is created
# This was fixed in 1.15, workaround still needed for 1.14 https://github.com/kubernetes/kubernetes/pull/78612
Restart-Service Kubeproxy

# startup the service
$env:AZURE_ENVIRONMENT_FILEPATH = "c:\k\azurestackcloud.json"
Invoke-Expression $KubeletCommandLine
}

if (($global:NetworkPlugin -eq "kubenet") -and ($global:ContainerRuntime -eq "docker")) {
$KubeNetwork = "l2bridge"
if ($global:NetworkPlugin -eq "kubenet") {
try {
$env:AZURE_ENVIRONMENT_FILEPATH = "c:\k\azurestackcloud.json"

Expand All @@ -254,7 +214,7 @@ if (($global:NetworkPlugin -eq "kubenet") -and ($global:ContainerRuntime -eq "do
if (-not $podCidrDiscovered) {
$argList = $KubeletArgListStr

$process = Start-Process -FilePath c:\k\kubelet.exe -PassThru -ArgumentList $kubeletArgList
$process = Start-Process -FilePath c:\k\kubelet.exe -PassThru -ArgumentList $argList

# run kubelet until podCidr is discovered
Write-Host "waiting to discover pod CIDR"
Expand All @@ -270,95 +230,31 @@ if (($global:NetworkPlugin -eq "kubenet") -and ($global:ContainerRuntime -eq "do
$process | Stop-Process | Out-Null
}

# startup the service
$hnsNetwork = Get-HnsNetwork | ? Name -EQ $global:NetworkMode.ToLower()

if ($hnsNetwork) {
# Kubelet has been restarted with existing network.
# Cleanup all containers
docker ps -q | foreach { docker rm $_ -f }
# cleanup network
Write-Host "Cleaning up old HNS network found"
Remove-HnsNetwork $hnsNetwork
Start-Sleep 10
}
./cleanupnetwork.ps1

Write-Host "Creating a new hns Network"
ipmo $global:HNSModule

$hnsNetwork = New-HNSNetwork -Type $global:NetworkMode -AddressPrefix $podCIDR -Gateway $masterSubnetGW -Name $global:NetworkMode.ToLower() -Verbose
# New network has been created, Kubeproxy service has to be restarted
# This was fixed in 1.15, workaround still needed for 1.14 https://github.com/kubernetes/kubernetes/pull/78612
Restart-Service Kubeproxy

Start-Sleep 10
# Add route to all other POD networks
Update-CNIConfigKubenetDocker $podCIDR $masterSubnetGW

Invoke-Expression $KubeletCommandLine
}
catch {
Write-Error $_
}

}

if (($global:NetworkPlugin -eq "kubenet") -and ($global:ContainerRuntime -eq "containerd")) {
$KubeNetwork = "l2bridge"
try {
$masterSubnetGW = Get-DefaultGateway $global:MasterSubnet
$podCIDR = Get-PodCIDR
$podCidrDiscovered = Test-PodCIDR($podCIDR)

# if the podCIDR has not yet been assigned to this node, start the kubelet process to get the podCIDR, and then promptly kill it.
if (-not $podCidrDiscovered) {
$argList = $KubeletArgListStr

$process = Start-Process -FilePath c:\k\kubelet.exe -PassThru -ArgumentList $argList

# run kubelet until podCidr is discovered
Write-Host "waiting to discover pod CIDR"
while (-not $podCidrDiscovered) {
Write-Host "Sleeping for 10s, and then waiting to discover pod CIDR"
Start-Sleep 10

$podCIDR = Get-PodCIDR
$podCidrDiscovered = Test-PodCIDR($podCIDR)
}

# stop the kubelet process now that we have our CIDR, discard the process output
$process | Stop-Process | Out-Null
if ($global:ContainerRuntime -eq "containerd") {
Write-Host "Updating CNI config"
Update-CNIConfigKubenetContainerD $podCIDR $masterSubnetGW
}

# startup the service
$hnsNetwork = Get-HnsNetwork | ? Name -EQ $global:NetworkMode.ToLower()

if ($hnsNetwork) {
# Kubelet has been restarted with existing network.
# Cleanup all containers
# TODO: convert this to ctr.exe -n k8s.io container list ; container rm
docker ps -q | foreach { docker rm $_ -f }
# cleanup network
Write-Host "Cleaning up old HNS network found"
Remove-HnsNetwork $hnsNetwork
Start-Sleep 10
if ($global:ContainerRuntime -eq "docker") {
# Add route to all other POD networks
Update-CNIConfigKubenetDocker $podCIDR $masterSubnetGW
}

Write-Host "Creating a new hns Network"
ipmo $global:HNSModule

$hnsNetwork = New-HNSNetwork -Type $global:NetworkMode -AddressPrefix $podCIDR -Gateway $masterSubnetGW -Name $global:NetworkMode.ToLower() -Verbose
# New network has been created, Kubeproxy service has to be restarted
Restart-Service Kubeproxy

Start-Sleep 10
# Add route to all other POD networks
Write-Host "Updating CNI config"
Update-CNIConfigKubenetContainerD $podCIDR $masterSubnetGW

# startup the service
Invoke-Expression $KubeletCommandLine
}
catch {
Write-Error $_
}

}
39 changes: 1 addition & 38 deletions staging/provisioning/windows/windowsnodereset.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -49,44 +49,7 @@ if ($global:EnableHostsConfigAgent) {
# Perform cleanup
#

$hnsNetwork = Get-HnsNetwork | Where-Object Name -EQ azure
if ($hnsNetwork) {
Write-Log "Cleaning up containers"
if ($UseContainerD -eq $true) {
ctr.exe -n k8s.io c ls -q | ForEach-Object { ctr -n k8s.io tasks kill $_ }
ctr.exe -n k8s.io c ls -q | ForEach-Object { ctr -n k8s.io c rm $_ }
}
else {
docker.exe ps -q | ForEach-Object { docker rm $_ -f }
}

Write-Log "Removing old HNS network 'azure'"
Remove-HnsNetwork $hnsNetwork

taskkill /IM azure-vnet.exe /f
taskkill /IM azure-vnet-ipam.exe /f

$filesToRemove = @(
"c:\k\azure-vnet.json",
"c:\k\azure-vnet.json.lock",
"c:\k\azure-vnet-ipam.json",
"c:\k\azure-vnet-ipam.json.lock"
"c:\k\azure-vnet-ipamv6.json",
"c:\k\azure-vnet-ipamv6.json.lock"
)

foreach ($file in $filesToRemove) {
if (Test-Path $file) {
Write-Log "Deleting stale file at $file"
Remove-Item $file
}
}
}

Write-Log "Cleaning up persisted HNS policy lists"
# Workaround for https://github.com/kubernetes/kubernetes/pull/68923 in < 1.14,
# and https://github.com/kubernetes/kubernetes/pull/78612 for <= 1.15
Get-HnsPolicyList | Remove-HnsPolicyList
./cleanupnetwork.ps1

#
# Create required networks
Expand Down