Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bugfix] Wait for ImageStatus #1602

Merged
merged 1 commit into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- (Feature) Extract Scheduler API
- (Bugfix) Fix Image Discovery
- (Bugfix) Fix Resources Copy mechanism to prevent invalid pod creation
- (Bugfix) Wait for ImageStatus in ImageDiscover

## [1.2.38](https://github.com/arangodb/kube-arangodb/tree/1.2.38) (2024-02-22)
- (Feature) Extract GRPC Server
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,15 @@ Flags:
--deployment.feature.upgrade-version-check-v2 Enable initContainer with pre version check based by Operator - Required ArangoDB 3.8.0 or higher
--features-config-map-name string Name of the Feature Map ConfigMap (default "arangodb-operator-feature-config-map")
-h, --help help for arangodb_operator
--image.discovery.status Discover Operator Image from Pod Status by default. When disabled Pod Spec is used. (default true)
--image.discovery.timeout duration Timeout for image discovery process (default 1m0s)
--internal.scaling-integration Enable Scaling Integration
--kubernetes.burst int Burst for the k8s API (default 30)
--kubernetes.max-batch-size int Size of batch during objects read (default 256)
--kubernetes.qps float32 Number of queries per second for k8s API (default 15)
--log.format string Set log format. Allowed values: 'pretty', 'JSON'. If empty, default format is used (default "pretty")
--log.level stringArray Set log levels in format <level> or <logger>=<level>. Possible loggers: action, agency, api-server, assertion, backup-operator, chaos-monkey, crd, deployment, deployment-ci, deployment-reconcile, deployment-replication, deployment-resilience, deployment-resources, deployment-storage, deployment-storage-pc, deployment-storage-service, http, inspector, k8s-client, ml-batchjob-operator, ml-cronjob-operator, ml-extension-operator, ml-extension-shutdown, ml-storage-operator, monitor, operator, operator-arangojob-handler, operator-v2, operator-v2-event, operator-v2-worker, panics, pod_compare, root, root-event-recorder, server, server-authentication (default [debug])
--log.level stringArray Set log levels in format <level> or <logger>=<level>. Possible loggers: action, agency, api-server, assertion, backup-operator, chaos-monkey, crd, deployment, deployment-ci, deployment-reconcile, deployment-replication, deployment-resilience, deployment-resources, deployment-storage, deployment-storage-pc, deployment-storage-service, http, inspector, integrations, k8s-client, ml-batchjob-operator, ml-cronjob-operator, ml-extension-operator, ml-extension-shutdown, ml-storage-operator, monitor, operator, operator-arangojob-handler, operator-v2, operator-v2-event, operator-v2-worker, panics, pod_compare, root, root-event-recorder, server, server-authentication (default [info])
--log.sampling If true, operator will try to minimize duplication of logging events (default true)
--memory-limit uint Define memory limit for hard shutdown and the dump of goroutines. Used for testing
--metrics.excluded-prefixes stringArray List of the excluded metrics prefixes
--mode.single Enable single mode in Operator. WARNING: There should be only one replica of Operator, otherwise Operator can take unexpected actions
Expand Down
39 changes: 35 additions & 4 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,10 @@ var (
backupArangoD time.Duration
backupUploadArangoD time.Duration
}
operatorImageDiscovery struct {
timeout time.Duration
defaultStatusDiscovery bool
}
operatorReconciliationRetry struct {
delay time.Duration
count int
Expand Down Expand Up @@ -241,6 +245,8 @@ func init() {
f.IntVar(&operatorBackup.concurrentUploads, "backup-concurrent-uploads", globals.DefaultBackupConcurrentUploads, "Number of concurrent uploads per deployment")
f.Uint64Var(&memoryLimit.hardLimit, "memory-limit", 0, "Define memory limit for hard shutdown and the dump of goroutines. Used for testing")
f.StringArrayVar(&metricsOptions.excludedMetricPrefixes, "metrics.excluded-prefixes", nil, "List of the excluded metrics prefixes")
f.BoolVar(&operatorImageDiscovery.defaultStatusDiscovery, "image.discovery.status", true, "Discover Operator Image from Pod Status by default. When disabled Pod Spec is used.")
f.DurationVar(&operatorImageDiscovery.timeout, "image.discovery.timeout", time.Minute, "Timeout for image discovery process")
if err := features.Init(&cmdMain); err != nil {
panic(err.Error())
}
Expand Down Expand Up @@ -584,6 +590,20 @@ func newOperatorConfigAndDeps(id, namespace, name string) (operator.Config, oper
// getMyPodInfo looks up the image & service account of the pod with given name in given namespace
// Returns image, serviceAccount, error.
func getMyPodInfo(kubecli kubernetes.Interface, namespace, name string) (string, string, error) {
if image, sa, ok := getMyPodInfoWrap(kubecli, namespace, name, getMyImageInfoFunc(operatorImageDiscovery.defaultStatusDiscovery)); ok {
return image, sa, nil
}

logger.Warn("Unable to discover image, fallback to second method")

if image, sa, ok := getMyPodInfoWrap(kubecli, namespace, name, getMyImageInfoFunc(!operatorImageDiscovery.defaultStatusDiscovery)); ok {
return image, sa, nil
}

return "", "", errors.Errorf("Unable to discover image")
}

func getMyPodInfoWrap(kubecli kubernetes.Interface, namespace, name string, imageFunc func(in *core.Pod) (string, bool)) (string, string, bool) {
var image, sa string
op := func() error {
pod, err := kubecli.CoreV1().Pods(namespace).Get(context.Background(), name, meta.GetOptions{})
Expand All @@ -595,15 +615,26 @@ func getMyPodInfo(kubecli kubernetes.Interface, namespace, name string) (string,
return errors.WithStack(err)
}
sa = pod.Spec.ServiceAccountName
if image, err = k8sutil.GetArangoDBImageIDFromPod(pod, shared.ServerContainerName, shared.OperatorContainerName, constants.MyContainerNameEnv.GetOrDefault(shared.OperatorContainerName)); err != nil {
if i, ok := imageFunc(pod); !ok {
return errors.Wrap(err, "failed to get image ID from pod")
} else {
image = i
}
return nil
}
if err := retry.Retry(op, time.Minute*5); err != nil {
return "", "", errors.WithStack(err)
if err := retry.Retry(op, operatorImageDiscovery.timeout/2); err == nil {
return image, sa, true
}
return "", "", false
}

func getMyImageInfoFunc(status bool) func(pod *core.Pod) (string, bool) {
return func(pod *core.Pod) (string, bool) {
if status {
return k8sutil.GetArangoDBImageIDFromContainerStatuses(pod.Status.ContainerStatuses, shared.ServerContainerName, shared.OperatorContainerName, constants.MyContainerNameEnv.GetOrDefault(shared.OperatorContainerName))
}
return k8sutil.GetArangoDBImageIDFromContainers(pod.Spec.Containers, shared.ServerContainerName, shared.OperatorContainerName, constants.MyContainerNameEnv.GetOrDefault(shared.OperatorContainerName))
}
return image, sa, nil
}

func createRecorder(kubecli kubernetes.Interface, name, namespace string) record.EventRecorder {
Expand Down
15 changes: 12 additions & 3 deletions pkg/util/k8sutil/images.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,16 @@ func GetArangoDBImageIDFromPod(pod *core.Pod, names ...string) (string, error) {
if image, ok := GetArangoDBImageIDFromContainerStatuses(pod.Status.ContainerStatuses, names...); ok {
return image, nil
}

if image, ok := GetArangoDBImageIDFromContainers(pod.Spec.Containers, names...); ok {
return image, nil
}

if cs := pod.Status.ContainerStatuses; len(cs) > 0 {
if image := cs[0].ImageID; image != "" {
return ConvertImageID2Image(image), nil
if disc := ConvertImageID2Image(image); disc != "" {
return disc, nil
}
}
}
if cs := pod.Spec.Containers; len(cs) > 0 {
Expand All @@ -75,7 +78,11 @@ func GetArangoDBImageIDFromPod(pod *core.Pod, names ...string) (string, error) {
func GetArangoDBImageIDFromContainerStatuses(containers []core.ContainerStatus, names ...string) (string, bool) {
for _, name := range names {
if id := container.GetContainerStatusIDByName(containers, name); id != -1 {
return ConvertImageID2Image(containers[id].ImageID), true
if image := containers[id].ImageID; image != "" {
if disc := ConvertImageID2Image(image); disc != "" {
return disc, true
}
}
}
}

Expand All @@ -86,7 +93,9 @@ func GetArangoDBImageIDFromContainerStatuses(containers []core.ContainerStatus,
func GetArangoDBImageIDFromContainers(containers []core.Container, names ...string) (string, bool) {
for _, name := range names {
if id := container.GetContainerIDByName(containers, name); id != -1 {
return containers[id].Image, true
if image := containers[id].Image; image != "" {
return image, true
}
}
}

Expand Down