Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ERROR_CHANNEL of websocket client is empty when a timeout occurs #2156

Open
Shaokun-X opened this issue Nov 30, 2023 · 7 comments
Open

ERROR_CHANNEL of websocket client is empty when a timeout occurs #2156

Shaokun-X opened this issue Nov 30, 2023 · 7 comments
Labels
kind/bug Categorizes issue or PR as related to a bug. lifecycle/rotten Denotes an issue or PR that has aged beyond stale and will be auto-closed.

Comments

@Shaokun-X
Copy link

Shaokun-X commented Nov 30, 2023

What happened:
If a timeout happens when streaming an exec command, the websocket client simply stops reading, and the ERROR_CHANNEL becomes empty if the exec command is not finished. Then if one tries to read the returncode property of the client, a TypeError will be raised:

    @property
    def returncode(self):
        if self.is_open():
            return None
        else:
            if self._returncode is None:
                # err will be None
                err = self.read_channel(ERROR_CHANNEL)
                err = yaml.safe_load(err)
                # 'NoneType' object is not subscriptable
                if err['status'] == "Success":
                    self._returncode = 0
                else:
                    self._returncode = int(err['details']['causes'][0]['message'])
            return self._returncode
    ...
    def read_channel(self, channel, timeout=0):
        # self._channels is {}
        if channel not in self._channels:
            ret = self.peek_channel(channel, timeout)
        else:
            ret = self._channels[channel]
        if channel in self._channels:
            del self._channels[channel]
        return ret
    ...
    def peek_channel(self, channel, timeout=0):
        self.update(timeout=timeout)
        if channel in self._channels:
            return self._channels[channel]
        return ""
    ...
    def update(self, timeout=0):
        if not self.is_open():
            # None is returned from here
            return
        ...

What you expected to happen:

A TimeoutException being raised, or a property on the client to query the timeout state

How to reproduce it (as minimally and precisely as possible):

import kubernetes
from kubernetes.stream import stream
from kubernetes import config

pod_name = "my-pod"
container = "my-container"
namespace = "my-namespace"
command = ["/bin/sh", "-c", "sleep 2"]

config.load_kube_config()
k = kubernetes.client.CoreV1Api()

s = stream(
    k.connect_get_namespaced_pod_exec,
    pod_name,
    namespace=namespace,
    container=container,
    command=command,
    stderr=True,
    stdin=False,
    stdout=True,
    tty=False,
    _preload_content=False,
)

s.run_forever(timeout=1)
s.close()

print(s.returncode)

# TypeError: 'NoneType' object is not subscriptable

Anything else we need to know?:
Here are some snippets from kubelet, it looks to me that the Status channel is only written when the execution in container finishes:

// stream exec implementation
func ServeExec(w http.ResponseWriter, req *http.Request, executor Executor, podName string, uid types.UID, container string, cmd []string, streamOpts *Options, idleTimeout, streamCreationTimeout time.Duration, supportedProtocols []string) {
	ctx, ok := createStreams(req, w, streamOpts, supportedProtocols, idleTimeout, streamCreationTimeout)
	if !ok {
		// error is handled by createStreams
		return
	}
	defer ctx.conn.Close()

	err := executor.ExecInContainer(podName, uid, container, cmd, ctx.stdinStream, ctx.stdoutStream, ctx.stderrStream, ctx.tty, ctx.resizeChan, 0)
	if err != nil {
		if exitErr, ok := err.(utilexec.ExitError); ok && exitErr.Exited() {
			rc := exitErr.ExitStatus()
			ctx.writeStatus(&apierrors.StatusError{ErrStatus: metav1.Status{
				Status: metav1.StatusFailure,
				Reason: remotecommandconsts.NonZeroExitCodeReason,
				Details: &metav1.StatusDetails{
					Causes: []metav1.StatusCause{
						{
							Type:    remotecommandconsts.ExitCodeCauseType,
							Message: fmt.Sprintf("%d", rc),
						},
					},
				},
				Message: fmt.Sprintf("command terminated with non-zero exit code: %v", exitErr),
			}})
		} else {
			err = fmt.Errorf("error executing command in container: %v", err)
			runtime.HandleError(err)
			ctx.writeStatus(apierrors.NewInternalError(err))
		}
	} else {
		ctx.writeStatus(&apierrors.StatusError{ErrStatus: metav1.Status{
			Status: metav1.StatusSuccess,
		}})
	}
}


// writeStatus definition
switch negotiatedProtocol {
case v4BinaryWebsocketProtocol, v4Base64WebsocketProtocol:
	ctx.writeStatus = v4WriteStatusFunc(streams[errorChannel])
default:
	ctx.writeStatus = v1WriteStatusFunc(streams[errorChannel])
}

// v1WriteStatusFunc definition
func v1WriteStatusFunc(stream io.Writer) func(status *apierrors.StatusError) error {
	return func(status *apierrors.StatusError) error {
		if status.Status().Status == metav1.StatusSuccess {
			return nil // send error messages
		}
		_, err := stream.Write([]byte(status.Error()))
		return err
	}
}

// v4WriteStatusFunc definition
func v4WriteStatusFunc(stream io.Writer) func(status *apierrors.StatusError) error {
	return func(status *apierrors.StatusError) error {
		bs, err := json.Marshal(status.Status())
		if err != nil {
			return err
		}
		_, err = stream.Write(bs)
		return err
	}
}

// NewInternalError returns an error indicating the item is invalid and cannot be processed.
func NewInternalError(err error) *StatusError {
	return &StatusError{metav1.Status{
		Status: metav1.StatusFailure,
		Code:   http.StatusInternalServerError,
		Reason: metav1.StatusReasonInternalError,
		Details: &metav1.StatusDetails{
			Causes: []metav1.StatusCause{{Message: err.Error()}},
		},
		Message: fmt.Sprintf("Internal error occurred: %v", err),
	}}
}

Environment:

  • Kubernetes version (kubectl version): v1.24.17-eks-4f4795d
  • OS (e.g., MacOS 10.13.6): Debian GNU/Linux 12 (Linux-5.10.198-187.748.amzn2.x86_64-x86_64-with-glibc2.36)
  • Python version (python --version): Python 3.11.6 [GCC 12.2.0]
  • Python client version (pip list | grep kubernetes): 27.2.0
@Shaokun-X Shaokun-X added the kind/bug Categorizes issue or PR as related to a bug. label Nov 30, 2023
@roycaihw
Copy link
Member

CC @iciclespider. Could you take a look?

@k8s-triage-robot
Copy link

The Kubernetes project currently lacks enough contributors to adequately respond to all issues.

This bot triages un-triaged issues according to the following rules:

  • After 90d of inactivity, lifecycle/stale is applied
  • After 30d of inactivity since lifecycle/stale was applied, lifecycle/rotten is applied
  • After 30d of inactivity since lifecycle/rotten was applied, the issue is closed

You can:

  • Mark this issue as fresh with /remove-lifecycle stale
  • Close this issue with /close
  • Offer to help out with Issue Triage

Please send feedback to sig-contributor-experience at kubernetes/community.

/lifecycle stale

@k8s-ci-robot k8s-ci-robot added the lifecycle/stale Denotes an issue or PR has remained open with no activity and has become stale. label May 12, 2024
@k8s-triage-robot
Copy link

The Kubernetes project currently lacks enough active contributors to adequately respond to all issues.

This bot triages un-triaged issues according to the following rules:

  • After 90d of inactivity, lifecycle/stale is applied
  • After 30d of inactivity since lifecycle/stale was applied, lifecycle/rotten is applied
  • After 30d of inactivity since lifecycle/rotten was applied, the issue is closed

You can:

  • Mark this issue as fresh with /remove-lifecycle rotten
  • Close this issue with /close
  • Offer to help out with Issue Triage

Please send feedback to sig-contributor-experience at kubernetes/community.

/lifecycle rotten

@k8s-ci-robot k8s-ci-robot added lifecycle/rotten Denotes an issue or PR that has aged beyond stale and will be auto-closed. and removed lifecycle/stale Denotes an issue or PR has remained open with no activity and has become stale. labels Jun 11, 2024
@Flauschbaellchen
Copy link

Flauschbaellchen commented Jul 5, 2024

Hi all,

I got the same problem while running custom scripts against a container.
As I always want to get the returncode afterwards to know whether it was successfully executed or not, I stumbled over this issue.

My code looks similar:

s = stream(
    k.connect_get_namespaced_pod_exec,
    pod_name,
    namespace=namespace,
    container=container,
    command=command,
    stderr=True,
    stdin=False,
    stdout=True,
    tty=False,
    _preload_content=False,
)

while s.is_open():
    s.update(timeout=1)
    if s.peek_stdout() or s.peek_stderr():
        # do some logging

if s.returncode:
    print(f"Command failed: exit code {s.returncode}")

Even if the `'NoneType' object is not subscriptable` error is fixed and another error is raised instead, such as a timeout error, it would not really solve my problem, as the check nevertheless becomes unreliable... are there any other options to improve the stability of executing scripts and checking the return code?

/remove-lifecycle rotten

@Flauschbaellchen
Copy link

As a side note: I was not able to reproduce this issue when running a command which does not return successfully, e.g. exit 1.
Whereas, with a command which returns successfully, I got this issue around 1 time out of 10.

/remove-lifecycle rotten

@k8s-ci-robot k8s-ci-robot removed the lifecycle/rotten Denotes an issue or PR that has aged beyond stale and will be auto-closed. label Jul 5, 2024
@k8s-triage-robot
Copy link

The Kubernetes project currently lacks enough contributors to adequately respond to all issues.

This bot triages un-triaged issues according to the following rules:

  • After 90d of inactivity, lifecycle/stale is applied
  • After 30d of inactivity since lifecycle/stale was applied, lifecycle/rotten is applied
  • After 30d of inactivity since lifecycle/rotten was applied, the issue is closed

You can:

  • Mark this issue as fresh with /remove-lifecycle stale
  • Close this issue with /close
  • Offer to help out with Issue Triage

Please send feedback to sig-contributor-experience at kubernetes/community.

/lifecycle stale

@k8s-ci-robot k8s-ci-robot added the lifecycle/stale Denotes an issue or PR has remained open with no activity and has become stale. label Oct 3, 2024
@k8s-triage-robot
Copy link

The Kubernetes project currently lacks enough active contributors to adequately respond to all issues.

This bot triages un-triaged issues according to the following rules:

  • After 90d of inactivity, lifecycle/stale is applied
  • After 30d of inactivity since lifecycle/stale was applied, lifecycle/rotten is applied
  • After 30d of inactivity since lifecycle/rotten was applied, the issue is closed

You can:

  • Mark this issue as fresh with /remove-lifecycle rotten
  • Close this issue with /close
  • Offer to help out with Issue Triage

Please send feedback to sig-contributor-experience at kubernetes/community.

/lifecycle rotten

@k8s-ci-robot k8s-ci-robot added lifecycle/rotten Denotes an issue or PR that has aged beyond stale and will be auto-closed. and removed lifecycle/stale Denotes an issue or PR has remained open with no activity and has become stale. labels Nov 2, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
kind/bug Categorizes issue or PR as related to a bug. lifecycle/rotten Denotes an issue or PR that has aged beyond stale and will be auto-closed.
Projects
None yet
Development

No branches or pull requests

5 participants