Skip to content

Commit

Permalink
feat: support cordoning server
Browse files Browse the repository at this point in the history
Adds `cordoned` field to *server* spec to prevent allocation

Signed-off-by: Noel Georgi <[email protected]>
  • Loading branch information
frezbo committed Nov 26, 2021
1 parent ab29103 commit e77bf54
Show file tree
Hide file tree
Showing 19 changed files with 206 additions and 20 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ MODULE := $(shell head -1 go.mod | cut -d' ' -f2)

ARTIFACTS := _out
TEST_PKGS ?= ./...
TALOS_RELEASE ?= v0.13.0
DEFAULT_K8S_VERSION ?= v1.22.2
TALOS_RELEASE ?= v0.13.3
DEFAULT_K8S_VERSION ?= v1.22.3

TOOLS ?= ghcr.io/talos-systems/tools:v0.8.0
PKGS ?= v0.8.0
Expand Down
2 changes: 2 additions & 0 deletions app/sidero-controller-manager/api/v1alpha1/server_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ type ServerSpec struct {
ManagementAPI *ManagementAPI `json:"managementApi,omitempty"`
ConfigPatches []ConfigPatches `json:"configPatches,omitempty"`
Accepted bool `json:"accepted"`
Cordoned bool `json:"cordoned,omitempty"`
PXEBootAlways bool `json:"pxeBootAlways,omitempty"`
}

Expand Down Expand Up @@ -192,6 +193,7 @@ type ServerStatus struct {
// +kubebuilder:resource:scope=Cluster
// +kubebuilder:printcolumn:name="Hostname",type="string",JSONPath=".spec.hostname",description="server hostname"
// +kubebuilder:printcolumn:name="Accepted",type="boolean",JSONPath=".spec.accepted",description="indicates if the server is accepted"
// +kubebuilder:printcolumn:name="Cordoned",type="boolean",JSONPath=".spec.cordoned",description="indicates if the server is cordoned"
// +kubebuilder:printcolumn:name="Allocated",type="boolean",JSONPath=".status.inUse",description="indicates that the server has been allocated"
// +kubebuilder:printcolumn:name="Clean",type="boolean",JSONPath=".status.isClean",description="indicates if the server is clean or not"
// +kubebuilder:printcolumn:name="Power",type="string",JSONPath=".status.power",description="display the current power status"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ func AcceptedServerFilter(s Server) (bool, error) {
return s.Spec.Accepted, nil
}

// NotCordonedServerFilter matches Servers that are not cordoned, i.e. those
// with Spec.Cordoned set to false. The returned error is always nil.
func NotCordonedServerFilter(s Server) (bool, error) {
	if s.Spec.Cordoned {
		return false, nil
	}

	return true, nil
}

// SelectorFilter returns a ServerFilter that matches servers against the
// serverclass's selector field.
func (sc *ServerClass) SelectorFilter() func(Server) (bool, error) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ spec:
jsonPath: .spec.accepted
name: Accepted
type: boolean
- description: indicates if the server is cordoned
jsonPath: .spec.cordoned
name: Cordoned
type: boolean
- description: indicates that the server has been allocated
jsonPath: .status.inUse
name: Allocated
Expand Down Expand Up @@ -143,6 +147,8 @@ spec:
- path
type: object
type: array
cordoned:
type: boolean
cpu:
properties:
manufacturer:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ func (r *ServerClassReconciler) Reconcile(ctx context.Context, req ctrl.Request)

results, err := metalv1alpha1.FilterServers(sl.Items,
metalv1alpha1.AcceptedServerFilter,
metalv1alpha1.NotCordonedServerFilter,
sc.SelectorFilter(),
sc.QualifiersFilter(),
)
Expand Down
2 changes: 1 addition & 1 deletion hack/scripts/generate-clusterctl-config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

set -eou pipefail

mkdir -p `dirname "${CLUSTERCTL_CONFIG}"`
mkdir -p "$(dirname "${CLUSTERCTL_CONFIG}")"

cat > "${CLUSTERCTL_CONFIG}" <<EOF
providers:
Expand Down
2 changes: 1 addition & 1 deletion hack/scripts/integration-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ function build_registry_mirrors {

for registry in docker.io k8s.gcr.io quay.io gcr.io ghcr.io registry.dev.talos-systems.io; do
local service="registry-${registry//./-}.ci.svc"
local addr=`python3 -c "import socket; print(socket.gethostbyname('${service}'))"`
local addr=$(python3 -c "import socket; print(socket.gethostbyname('${service}'))")

REGISTRY_MIRROR_FLAGS="${REGISTRY_MIRROR_FLAGS} --registry-mirror ${registry}=http://${addr}:5000"
done
Expand Down
4 changes: 4 additions & 0 deletions hack/start-registry-proxies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,7 @@ docker run -d -p 5004:5000 \
-e REGISTRY_PROXY_REMOTEURL=https://ghcr.io \
--restart always \
--name registry-ghcr.io registry:2

docker run -d -p 5005:5000 \
--restart always \
--name registry-local registry:2
38 changes: 34 additions & 4 deletions sfyra/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,15 @@ Integration test for Sidero/Arges.

## Running

It is recommended to run the test suite with a local registry mirror running.

To start the local mirrors, run `hack/start-registry-proxies.sh`.
This creates a local registry along with pull-through mirrors for `registry-1.docker.io`, `k8s.gcr.io`, `quay.io`, `gcr.io` and `ghcr.io`.

Build the test binary and Sidero, push images:

> If you have the local registry mirrors running, add `REGISTRY=127.0.0.1:5005` to all the make commands (this will speed up development a lot).
```sh
make USERNAME=<username> TAG=v0.4.0 PUSH=true
```
Expand All @@ -29,6 +36,8 @@ It's also possible to run Sfyra manually to avoid tearing down and recreating wh
After `make USERNAME=<username> TAG=v0.4.0 PUSH=true` run:

```sh
# build sfyra
make sfyra
make talos-artifacts # need to run it only once per Talos release change
make clusterctl-release USERNAME=<username> TAG=v0.4.0 PUSH=true
```
Expand All @@ -49,11 +58,10 @@ export SFYRA_EXTRA_FLAGS="--skip-teardown"
make run-sfyra
```

With `--skip-teardown` flag test leaves the bootstrap cluster running so that next iteration of the test can be run without waiting for the boostrap actions to be finished.
With the `--skip-teardown` flag, the test leaves the bootstrap cluster running so that the next iteration of the test can run without waiting for the bootstrap actions to finish.
It's possible to run Sfyra tests once again without bringing down the test environment, but make sure that all the clusters are deleted with `kubectl delete clusters --all`.

Flag `--registry-mirror` is optional, but it speeds up provisioning significantly.
See Talos guides on setting up registry pull-through caches, or just run `hack/start-registry-proxies.sh`.

Kubernetes config can be pulled with `talosctl -n 172.24.0.2 kubeconfig --force`.

Expand All @@ -75,8 +83,6 @@ Sometimes it's important to test the flow when the servers are configured to boo
This can be achieved by adding a flag `--default-boot-order=nc` to `sfyra` invocation.
In this case Sidero iPXE server will force VM to boot from disk via iPXE if the server is already provisioned.

> Note: due to the dependency on new `talosctl`, this feature will be available once Talos in Sfyra is updated to version >= 0.11.
## Running with Talos HEAD as a bootstrap cluster

Build the artifacts in Talos:
Expand Down Expand Up @@ -124,3 +130,27 @@ To destroy Sfyra environment use `talosctl`:
sudo -E talosctl cluster destroy --provisioner=qemu --name=sfyra
sudo -E talosctl cluster destroy --provisioner=qemu --name=sfyra-management
```

## Manually registering a server for testing

### Registering

```bash
grpcurl \
-proto app/sidero-controller-manager/internal/api/api.proto \
-plaintext \
-d '{"hostname":"fake","cpu":{"manufacturer":"QEMU","version":"pc-q35-5.2"},"system_information":{"uuid": "a9cf15ab-d96b-4544-b6ab-baebb262213b","family":"Unknown","manufacturer":"QEMU","productName":"Standard PC (Q35 + ICH9, 2009)","serialNumber":"Unknown","skuNumber":"Unknown","version":"pc-q35-5.2"}}' \
172.24.0.2:8081 \
api.Agent/CreateServer
```

### Marking server as cleaned

```bash
grpcurl \
-proto app/sidero-controller-manager/internal/api/api.proto \
-plaintext \
-d '{"uuid": "a9cf15ab-d96b-4544-b6ab-baebb262213b"}' \
172.24.0.2:8081 \
api.Agent/MarkServerAsWiped
```
2 changes: 1 addition & 1 deletion sfyra/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ require (
github.com/talos-systems/go-retry v0.3.1
github.com/talos-systems/net v0.3.1-0.20211112122313-0abe5bdae8f8
github.com/talos-systems/sidero v0.0.0-00010101000000-000000000000
github.com/talos-systems/talos v0.13.2
github.com/talos-systems/talos v0.13.3
github.com/talos-systems/talos/pkg/machinery v0.14.0-alpha.1.0.20211123134516-852bf4a7de81
google.golang.org/grpc v1.42.0
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
Expand Down
6 changes: 3 additions & 3 deletions sfyra/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1062,10 +1062,10 @@ github.com/talos-systems/net v0.3.0/go.mod h1:VreSAyRmxMtqussAHSKMKkJQa1YwBTSVfk
github.com/talos-systems/net v0.3.1-0.20211112122313-0abe5bdae8f8 h1:oT2MASZ8V3DuZbhaJWJ8oZ373zfmgXpvw2xLHM5cOYk=
github.com/talos-systems/net v0.3.1-0.20211112122313-0abe5bdae8f8/go.mod h1:zhcGixNJz9dgwFiUwc7gkkAqdVqXagU1SNNoIVXYKGo=
github.com/talos-systems/siderolink v0.0.0-20211125180240-f7cadbcdfbb8/go.mod h1:bEGwDYl9QgC3oZ4kdnJTuR2HX/XlUhxZjx/QAakKuBc=
github.com/talos-systems/talos v0.13.2 h1:XfVA0rO4MxwXsfrTvWkKgxRHRUuMWaYVeU4ORpz7Jqc=
github.com/talos-systems/talos v0.13.2/go.mod h1:238xIFxpQ5fBRWrA1+hlsz0kCQ1N7tHx/SlKYocumeI=
github.com/talos-systems/talos v0.13.3 h1:enZICSB9A8SLzF2Kdv4GIbZv93KPhK+P9WKOm9my4Fg=
github.com/talos-systems/talos v0.13.3/go.mod h1:qKfAsuUd2pz1ZVj0P91oVdmYXdNO9F3fJJ0cbvFlkxE=
github.com/talos-systems/talos/pkg/machinery v0.12.3/go.mod h1:qX77JMZawrDTQaJucqecdlFsHy+dbnZ9YL8Kw4qL7d4=
github.com/talos-systems/talos/pkg/machinery v0.13.1/go.mod h1:fQx1FlvFLSexSOYL1DSl0EjtazujlzNmVDCt2yRoLJ4=
github.com/talos-systems/talos/pkg/machinery v0.13.3/go.mod h1:dJ0cLzqGZJgrFH8dhXpGJHg+r8tzktgjQXEqCOmljdg=
github.com/talos-systems/talos/pkg/machinery v0.14.0-alpha.1.0.20211118180932-1ffa8e048008/go.mod h1:D8NT4Aj+X2OpA6yK6RAtpw1wcgkDS7oD23vqOQWRiP8=
github.com/talos-systems/talos/pkg/machinery v0.14.0-alpha.1.0.20211123134516-852bf4a7de81 h1:/J0yS/k+yfzQpF0Cs2jQfstchDH2Nxnce2XuCxueLGM=
github.com/talos-systems/talos/pkg/machinery v0.14.0-alpha.1.0.20211123134516-852bf4a7de81/go.mod h1:D8NT4Aj+X2OpA6yK6RAtpw1wcgkDS7oD23vqOQWRiP8=
Expand Down
2 changes: 1 addition & 1 deletion sfyra/pkg/tests/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func TestEnvironmentDefault(ctx context.Context, metalClient client.Client, clus
require.NoError(t, err)

environment = v1alpha1.Environment{}
err = retry.Constant(30 * time.Second).Retry(func() error {
err = retry.Constant(60 * time.Second).Retry(func() error {
if err := metalClient.Get(ctx, types.NamespacedName{Name: v1alpha1.EnvironmentDefault}, &environment); err != nil {
if apierrors.IsNotFound(err) {
return retry.ExpectedError(err)
Expand Down
131 changes: 131 additions & 0 deletions sfyra/pkg/tests/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,122 @@ func TestServerAcceptance(ctx context.Context, metalClient client.Client, vmSet
}
}

// TestServerCordoned verifies that the Spec.Cordoned flag controls whether a
// server is listed as available in the `any` serverclass.
//
// The test creates numDummies dummy servers, marks all of them as accepted and
// waits until every one is reported as available. It then cordons the first
// server and waits for the available count to drop by one, and finally
// uncordons it and waits for the count to recover.
//
// vmSet is not used by the test.
func TestServerCordoned(ctx context.Context, metalClient client.Client, vmSet *vm.Set) TestFunc {
	return func(t *testing.T) {
		const numDummies = 3

		// create dummy servers to test with
		dummySpec := v1alpha1.ServerSpec{
			CPU: &v1alpha1.CPUInformation{
				Manufacturer: "DummyManufacturer",
			},
		}

		for i := 0; i < numDummies; i++ {
			_, err := createDummyServer(ctx, metalClient, "dummyserver-"+strconv.Itoa(i), dummySpec)
			require.NoError(t, err)
		}

		dummyServers := &v1alpha1.ServerList{}

		// NOTE(review): presumably createDummyServer labels the servers with
		// `dummy-server` — confirm against the helper.
		labelSelector, err := labels.Parse("dummy-server=")
		require.NoError(t, err)
		require.NoError(t, metalClient.List(ctx, dummyServers, client.MatchingLabelsSelector{Selector: labelSelector}))

		// clean up dummies; deletion stays best-effort, but failures are logged
		// instead of being silently dropped so leaked servers can be diagnosed.
		defer func() {
			for _, server := range dummyServers.Items {
				server := server

				if err := metalClient.Delete(ctx, &server); err != nil {
					t.Logf("failed to delete dummy server %q: %v", server.Name, err)
				}
			}
		}()

		// waitForAvailable polls the `any` serverclass until exactly `expected`
		// of the dummy servers are reported as available. A failed Get is
		// returned as-is, while a count mismatch is wrapped in
		// retry.ExpectedError so that polling continues.
		waitForAvailable := func(expected int) {
			require.NoError(t, retry.Constant(30*time.Second, retry.WithUnits(5*time.Second)).Retry(func() error {
				var serverClass v1alpha1.ServerClass

				if err := metalClient.Get(ctx, types.NamespacedName{Name: v1alpha1.ServerClassAny}, &serverClass); err != nil {
					return err
				}

				if got := len(getAvailableServersFromServerClass(serverClass, dummyServers)); got != expected {
					return retry.ExpectedError(fmt.Errorf("%d != %d", got, expected))
				}

				return nil
			}))
		}

		// setCordoned fetches the current state of the named server and patches
		// its Spec.Cordoned field to the requested value.
		setCordoned := func(name string, cordoned bool) {
			var server v1alpha1.Server

			require.NoError(t, metalClient.Get(ctx, types.NamespacedName{Name: name}, &server))

			patchHelper, err := patch.NewHelper(&server, metalClient)
			require.NoError(t, err)

			server.Spec.Cordoned = cordoned

			require.NoError(t, patchHelper.Patch(ctx, &server))
		}

		// patch all servers as accepted
		for _, server := range dummyServers.Items {
			server := server

			patchHelper, err := patch.NewHelper(&server, metalClient)
			require.NoError(t, err)

			server.Spec.Accepted = true
			require.NoError(t, patchHelper.Patch(ctx, &server))
		}

		// verify that all servers show up as available in the `any` serverclass
		waitForAvailable(numDummies)

		// cordon a single server and verify it is no longer available
		serverName := dummyServers.Items[0].Name

		setCordoned(serverName, true)
		waitForAvailable(numDummies - 1)

		// uncordon the server and verify it becomes available again
		setCordoned(serverName, false)
		waitForAvailable(numDummies)
	}
}

// TestServerResetOnAcceptance tests that servers are reset when accepted.
func TestServerResetOnAcceptance(ctx context.Context, metalClient client.Client) TestFunc {
return func(t *testing.T) {
Expand Down Expand Up @@ -403,3 +519,18 @@ func createDummyServer(ctx context.Context, metalClient client.Client, name stri
return nil
})
}

// getAvailableServersFromServerClass collects the names of the servers in
// serverList that also appear in the serverclass's Status.ServersAvailable.
// Results follow the order of serverList.Items; nil is returned when nothing matches.
func getAvailableServersFromServerClass(serverClass v1alpha1.ServerClass, serverList *v1alpha1.ServerList) []string {
	var matched []string

	available := serverClass.Status.ServersAvailable

	for i := range serverList.Items {
		name := serverList.Items[i].Name

		for _, candidate := range available {
			if candidate != name {
				continue
			}

			matched = append(matched, candidate)
		}
	}

	return matched
}
4 changes: 4 additions & 0 deletions sfyra/pkg/tests/tests.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ func Run(ctx context.Context, cluster talos.Cluster, vmSet *vm.Set, capiManager
"TestServerAcceptance",
TestServerAcceptance(ctx, metalClient, vmSet),
},
{
"TestServerCordoned",
TestServerCordoned(ctx, metalClient, vmSet),
},
{
"TestServerResetOnAcceptance",
TestServerResetOnAcceptance(ctx, metalClient),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ For instance:
```bash
export CONTROL_PLANE_SERVERCLASS=any
export WORKER_SERVERCLASS=any
export TALOS_VERSION=v0.13.0
export TALOS_VERSION=v0.13.3
export KUBERNETES_VERSION=v1.22.2
export CONTROL_PLANE_PORT=6443
export CONTROL_PLANE_ENDPOINT=1.2.3.4
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ metadata:
name: default
spec:
kernel:
url: "https://github.com/talos-systems/talos/releases/download/v0.13.0/vmlinuz-amd64"
url: "https://github.com/talos-systems/talos/releases/download/v0.13.3/vmlinuz-amd64"
sha512: ""
args:
- console=tty0
Expand All @@ -46,7 +46,7 @@ spec:
- talos.config=http://$PUBLIC_IP:8081/configdata?uuid=
- talos.platform=metal
initrd:
url: "https://github.com/talos-systems/talos/releases/download/v0.13.0/initramfs-amd64.xz"
url: "https://github.com/talos-systems/talos/releases/download/v0.13.3/initramfs-amd64.xz"
sha512: ""
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ For instance:
```bash
export CONTROL_PLANE_SERVERCLASS=any
export WORKER_SERVERCLASS=any
export TALOS_VERSION=v0.13.0
export TALOS_VERSION=v0.13.3
export KUBERNETES_VERSION=v1.22.2
export CONTROL_PLANE_PORT=6443
export CONTROL_PLANE_ENDPOINT=1.2.3.4
Expand Down
Loading

0 comments on commit e77bf54

Please sign in to comment.