diff --git a/utilities/automake.mk b/utilities/automake.mk index 37d679f8227..9a2114df40a 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -23,6 +23,8 @@ scripts_DATA += utilities/ovs-lib usdt_SCRIPTS += \ utilities/usdt-scripts/bridge_loop.bt \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/kernel_delay.py \ + utilities/usdt-scripts/kernel_delay.rst \ utilities/usdt-scripts/reval_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py @@ -70,6 +72,8 @@ EXTRA_DIST += \ utilities/docker/debian/build-kernel-modules.sh \ utilities/usdt-scripts/bridge_loop.bt \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/kernel_delay.py \ + utilities/usdt-scripts/kernel_delay.rst \ utilities/usdt-scripts/reval_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py diff --git a/utilities/usdt-scripts/kernel_delay.py b/utilities/usdt-scripts/kernel_delay.py new file mode 100755 index 00000000000..636e108be1b --- /dev/null +++ b/utilities/usdt-scripts/kernel_delay.py @@ -0,0 +1,1420 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022,2023 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Script information: +# ------------------- +# This script allows a developer to quickly identify if the issue at hand +# might be related to the kernel running out of resources or if it really is +# an Open vSwitch issue. +# +# For documentation see the kernel_delay.rst file. +# +# +# Dependencies: +# ------------- +# You need to install the BCC package for your specific platform or build it +# yourself using the following instructions: +# https://raw.githubusercontent.com/iovisor/bcc/master/INSTALL.md +# +# Python needs the following additional packages installed: +# - pytz +# - psutil +# +# You can either install your distribution specific package or use pip: +# pip install pytz psutil +# +import argparse +import datetime +import os +import pytz +import psutil +import re +import sys +import time + +import ctypes as ct + +try: + from bcc import BPF, USDT, USDTException + from bcc.syscall import syscalls, syscall_name +except ModuleNotFoundError: + print("ERROR: Can't find the BPF Compiler Collection (BCC) tools!") + sys.exit(os.EX_OSFILE) + +from enum import IntEnum + + +# +# Actual eBPF source code +# +EBPF_SOURCE = """ +#include +#include + +#define MONITOR_PID + +enum { + +}; + +struct event_t { + u64 ts; + u32 tid; + u32 id; + + int user_stack_id; + int kernel_stack_id; + + u32 syscall; + u64 entry_ts; + +}; + +BPF_RINGBUF_OUTPUT(events, ); +BPF_STACK_TRACE(stack_traces, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); +BPF_TABLE("percpu_array", uint32_t, uint64_t, trigger_miss, 1); + +BPF_ARRAY(capture_on, u64, 1); +static inline bool capture_enabled(u64 pid_tgid) { + int key = 0; + u64 *ret; + + if ((pid_tgid >> 32) != MONITOR_PID) + return false; + + ret = capture_on.lookup(&key); + return ret && *ret == 1; +} + +static inline bool capture_enabled__() { + int key = 0; + u64 *ret; + + ret = capture_on.lookup(&key); + return ret && *ret == 1; +} + +static struct event_t *get_event(uint32_t id) { + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + + if (!event) { + dropcnt.increment(0); + return NULL; + } + + event->id = id; + event->ts = bpf_ktime_get_ns(); + event->tid = bpf_get_current_pid_tgid(); + + return event; +} + +static int start_trigger() { + int key = 0; + u64 *val = capture_on.lookup(&key); + + /* If the value is -1 we can't start as we are still processing the + * results in userspace. */ + if (!val || *val != 0) { + trigger_miss.increment(0); + return 0; + } + + struct event_t *event = get_event(EVENT_START_TRIGGER); + if (event) { + events.ringbuf_submit(event, 0); + *val = 1; + } else { + trigger_miss.increment(0); + } + return 0; +} + +static int stop_trigger() { + int key = 0; + u64 *val = capture_on.lookup(&key); + + if (!val || *val != 1) + return 0; + + struct event_t *event = get_event(EVENT_STOP_TRIGGER); + + if (event) + events.ringbuf_submit(event, 0); + + if (val) + *val = -1; + + return 0; +} + + + + + +/* + * For the syscall monitor the following probes get installed. + */ +struct syscall_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct syscall_data_key_t { + u32 pid; + u32 tid; + u32 syscall; +}; + +BPF_HASH(syscall_start, u64, u64); +BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t); + +TRACEPOINT_PROBE(raw_syscalls, sys_enter) { + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + u64 t = bpf_ktime_get_ns(); + syscall_start.update(&pid_tgid, &t); + + return 0; +} + +TRACEPOINT_PROBE(raw_syscalls, sys_exit) { + struct syscall_data_t *val, zero = {}; + struct syscall_data_key_t key; + + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + key.pid = pid_tgid >> 32; + key.tid = (u32)pid_tgid; + key.syscall = args->id; + + u64 *start_ns = syscall_start.lookup(&pid_tgid); + + if (!start_ns) + return 0; + + val = syscall_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - *start_ns; + val->count++; + val->total_ns += delta; + if (val->worst_ns == 0 || delta > val->worst_ns) + val->worst_ns = delta; + + if () { + struct event_t *event = get_event(EVENT_SYSCALL); + if (event) { + event->syscall = args->id; + event->entry_ts = *start_ns; + if () { + event->user_stack_id = stack_traces.get_stackid( + args, BPF_F_USER_STACK); + event->kernel_stack_id = stack_traces.get_stackid( + args, 0); + } + events.ringbuf_submit(event, 0); + } + } + } + return 0; +} + + +/* + * For measuring the thread run time, we need the following. + */ +struct run_time_data_t { + u64 count; + u64 total_ns; + u64 max_ns; + u64 min_ns; +}; + +struct pid_tid_key_t { + u32 pid; + u32 tid; +}; + +BPF_HASH(run_start, u64, u64); +BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t); + +static inline void thread_start_run(u64 pid_tgid, u64 ktime) +{ + run_start.update(&pid_tgid, &ktime); +} + +static inline void thread_stop_run(u32 pid, u32 tgid, u64 ktime) +{ + u64 pid_tgid = (u64) tgid << 32 | pid; + u64 *start_ns = run_start.lookup(&pid_tgid); + + if (!start_ns || *start_ns == 0) + return; + + struct run_time_data_t *val, zero = {}; + struct pid_tid_key_t key = { .pid = tgid, + .tid = pid }; + + val = run_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = ktime - *start_ns; + val->count++; + val->total_ns += delta; + if (val->max_ns == 0 || delta > val->max_ns) + val->max_ns = delta; + if (val->min_ns == 0 || delta < val->min_ns) + val->min_ns = delta; + } + *start_ns = 0; +} + + +/* + * For measuring the thread-ready delay, we need the following. + */ +struct ready_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +BPF_HASH(ready_start, u64, u64); +BPF_HASH(ready_data, struct pid_tid_key_t, struct ready_data_t); + +static inline int sched_wakeup__(u32 pid, u32 tgid) +{ + u64 pid_tgid = (u64) tgid << 32 | pid; + + if (!capture_enabled(pid_tgid)) + return 0; + + u64 t = bpf_ktime_get_ns(); + ready_start.update(&pid_tgid, &t); + return 0; +} + +RAW_TRACEPOINT_PROBE(sched_wakeup) +{ + struct task_struct *t = (struct task_struct *)ctx->args[0]; + return sched_wakeup__(t->pid, t->tgid); +} + +RAW_TRACEPOINT_PROBE(sched_wakeup_new) +{ + struct task_struct *t = (struct task_struct *)ctx->args[0]; + return sched_wakeup__(t->pid, t->tgid); +} + +RAW_TRACEPOINT_PROBE(sched_switch) +{ + struct task_struct *prev = (struct task_struct *)ctx->args[1]; + struct task_struct *next= (struct task_struct *)ctx->args[2]; + u64 ktime = 0; + + if (!capture_enabled__()) + return 0; + + if (prev-> == TASK_RUNNING && prev->tgid == MONITOR_PID) + sched_wakeup__(prev->pid, prev->tgid); + + if (prev->tgid == MONITOR_PID) { + ktime = bpf_ktime_get_ns(); + thread_stop_run(prev->pid, prev->tgid, ktime); + } + + u64 pid_tgid = (u64)next->tgid << 32 | next->pid; + + if (next->tgid != MONITOR_PID) + return 0; + + if (ktime == 0) + ktime = bpf_ktime_get_ns(); + + u64 *start_ns = ready_start.lookup(&pid_tgid); + + if (start_ns && *start_ns != 0) { + + struct ready_data_t *val, zero = {}; + struct pid_tid_key_t key = { .pid = next->tgid, + .tid = next->pid }; + + val = ready_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = ktime - *start_ns; + val->count++; + val->total_ns += delta; + if (val->worst_ns == 0 || delta > val->worst_ns) + val->worst_ns = delta; + } + *start_ns = 0; + } + + thread_start_run(pid_tgid, ktime); + return 0; +} + + +/* + * For measuring the hard irq time, we need the following. + */ +struct hardirq_start_data_t { + u64 start_ns; + char irq_name[32]; +}; + +struct hardirq_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct hardirq_data_key_t { + u32 pid; + u32 tid; + char irq_name[32]; +}; + +BPF_HASH(hardirq_start, u64, struct hardirq_start_data_t); +BPF_HASH(hardirq_data, struct hardirq_data_key_t, struct hardirq_data_t); + +TRACEPOINT_PROBE(irq, irq_handler_entry) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct hardirq_start_data_t data = {}; + + data.start_ns = bpf_ktime_get_ns(); + TP_DATA_LOC_READ_STR(&data.irq_name, name, sizeof(data.irq_name)); + hardirq_start.update(&pid_tgid, &data); + return 0; +} + +TRACEPOINT_PROBE(irq, irq_handler_exit) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct hardirq_start_data_t *data; + data = hardirq_start.lookup(&pid_tgid); + if (!data || data->start_ns == 0) + return 0; + + if (args->ret != IRQ_NONE) { + struct hardirq_data_t *val, zero = {}; + struct hardirq_data_key_t key = { .pid = pid_tgid >> 32, + .tid = (u32)pid_tgid }; + + bpf_probe_read_kernel(&key.irq_name, sizeof(key.irq_name), + data->irq_name); + val = hardirq_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - data->start_ns; + val->count++; + val->total_ns += delta; + if (val->worst_ns == 0 || delta > val->worst_ns) + val->worst_ns = delta; + } + } + + data->start_ns = 0; + return 0; +} + + +/* + * For measuring the soft irq time, we need the following. + */ +struct softirq_start_data_t { + u64 start_ns; + u32 vec_nr; +}; + +struct softirq_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct softirq_data_key_t { + u32 pid; + u32 tid; + u32 vec_nr; +}; + +BPF_HASH(softirq_start, u64, struct softirq_start_data_t); +BPF_HASH(softirq_data, struct softirq_data_key_t, struct softirq_data_t); + +TRACEPOINT_PROBE(irq, softirq_entry) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct softirq_start_data_t data = {}; + + data.start_ns = bpf_ktime_get_ns(); + data.vec_nr = args->vec; + softirq_start.update(&pid_tgid, &data); + return 0; +} + +TRACEPOINT_PROBE(irq, softirq_exit) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct softirq_start_data_t *data; + data = softirq_start.lookup(&pid_tgid); + if (!data || data->start_ns == 0) + return 0; + + struct softirq_data_t *val, zero = {}; + struct softirq_data_key_t key = { .pid = pid_tgid >> 32, + .tid = (u32)pid_tgid, + .vec_nr = data->vec_nr}; + + val = softirq_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - data->start_ns; + val->count++; + val->total_ns += delta; + if (val->worst_ns == 0 || delta > val->worst_ns) + val->worst_ns = delta; + } + + data->start_ns = 0; + return 0; +} +""" + + +# +# time_ns() +# +try: + from time import time_ns +except ImportError: + # For compatibility with Python <= v3.6. + def time_ns(): + now = datetime.datetime.now() + return int(now.timestamp() * 1e9) + + +# +# Probe class to use for the start/stop triggers +# +class Probe(object): + ''' + The goal for this object is to support as many as possible + probe/events as supported by BCC. See + https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments + ''' + def __init__(self, probe, pid=None): + self.pid = pid + self.text_probe = probe + self._parse_text_probe() + + def __str__(self): + if self.probe_type == "usdt": + return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type, + self.usdt_provider, self.usdt_probe) + elif self.probe_type == "trace": + return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type, + self.trace_system, self.trace_event) + elif self.probe_type == "kprobe" or self.probe_type == "kretprobe": + return "[{}]; {}:{}".format(self.text_probe, self.probe_type, + self.kprobe_function) + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": + return "[{}]; {}:{}".format(self.text_probe, self.probe_type, + self.uprobe_function) + else: + return "[{}] <{}:unknown probe>".format(self.text_probe, + self.probe_type) + + def _raise(self, error): + raise ValueError("[{}]; {}".format(self.text_probe, error)) + + def _verify_kprobe_probe(self): + # Nothing to verify for now, just return. + return + + def _verify_trace_probe(self): + # Nothing to verify for now, just return. + return + + def _verify_uprobe_probe(self): + # Nothing to verify for now, just return. + return + + def _verify_usdt_probe(self): + if not self.pid: + self._raise("USDT probes need a valid PID.") + + usdt = USDT(pid=self.pid) + + for probe in usdt.enumerate_probes(): + if probe.provider.decode('utf-8') == self.usdt_provider and \ + probe.name.decode('utf-8') == self.usdt_probe: + return + + self._raise("Can't find UDST probe '{}:{}'".format(self.usdt_provider, + self.usdt_probe)) + + def _parse_text_probe(self): + ''' + The text probe format is defined as follows: + : + + Types: + USDT: u|usdt:: + TRACE: t|trace:: + KPROBE: k|kprobe: + KRETPROBE: kr|kretprobe: + UPROBE: up|uprobe: + URETPROBE: ur|uretprobe: + ''' + args = self.text_probe.split(":") + if len(args) <= 1: + self._raise("Can't extract probe type.") + + if args[0] not in ["k", "kprobe", "kr", "kretprobe", "t", "trace", + "u", "usdt", "up", "uprobe", "ur", "uretprobe"]: + self._raise("Invalid probe type '{}'".format(args[0])) + + self.probe_type = "kprobe" if args[0] == "k" else args[0] + self.probe_type = "kretprobe" if args[0] == "kr" else self.probe_type + self.probe_type = "trace" if args[0] == "t" else self.probe_type + self.probe_type = "usdt" if args[0] == "u" else self.probe_type + self.probe_type = "uprobe" if args[0] == "up" else self.probe_type + self.probe_type = "uretprobe" if args[0] == "ur" else self.probe_type + + if self.probe_type == "usdt": + if len(args) != 3: + self._raise("Invalid number of arguments for USDT") + + self.usdt_provider = args[1] + self.usdt_probe = args[2] + self._verify_usdt_probe() + + elif self.probe_type == "trace": + if len(args) != 3: + self._raise("Invalid number of arguments for TRACE") + + self.trace_system = args[1] + self.trace_event = args[2] + self._verify_trace_probe() + + elif self.probe_type == "kprobe" or self.probe_type == "kretprobe": + if len(args) != 2: + self._raise("Invalid number of arguments for K(RET)PROBE") + self.kprobe_function = args[1] + self._verify_kprobe_probe() + + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": + if len(args) != 2: + self._raise("Invalid number of arguments for U(RET)PROBE") + self.uprobe_function = args[1] + self._verify_uprobe_probe() + + def _get_kprobe_c_code(self, function_name, function_content): + # + # The kprobe__* do not require a function name, so it's + # ignored in the code generation. + # + return """ +int {}__{}(struct pt_regs *ctx) {{ + {} +}} +""".format(self.probe_type, self.kprobe_function, function_content) + + def _get_trace_c_code(self, function_name, function_content): + # + # The TRACEPOINT_PROBE() do not require a function name, so it's + # ignored in the code generation. + # + return """ +TRACEPOINT_PROBE({},{}) {{ + {} +}} +""".format(self.trace_system, self.trace_event, function_content) + + def _get_uprobe_c_code(self, function_name, function_content): + return """ +int {}(struct pt_regs *ctx) {{ + {} +}} +""".format(function_name, function_content) + + def _get_usdt_c_code(self, function_name, function_content): + return """ +int {}(struct pt_regs *ctx) {{ + {} +}} +""".format(function_name, function_content) + + def get_c_code(self, function_name, function_content): + if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe': + return self._get_kprobe_c_code(function_name, function_content) + elif self.probe_type == 'trace': + return self._get_trace_c_code(function_name, function_content) + elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe': + return self._get_uprobe_c_code(function_name, function_content) + elif self.probe_type == 'usdt': + return self._get_usdt_c_code(function_name, function_content) + + return "" + + def probe_name(self): + if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe': + return "{}".format(self.kprobe_function) + elif self.probe_type == 'trace': + return "{}:{}".format(self.trace_system, + self.trace_event) + elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe': + return "{}".format(self.uprobe_function) + elif self.probe_type == 'usdt': + return "{}:{}".format(self.usdt_provider, + self.usdt_probe) + + return "" + + +# +# event_to_dict() +# +def event_to_dict(event): + return dict([(field, getattr(event, field)) + for (field, _) in event._fields_ + if isinstance(getattr(event, field), (int, bytes))]) + + +# +# Event enum +# +Event = IntEnum("Event", ["SYSCALL", "START_TRIGGER", "STOP_TRIGGER"], + start=0) + + +# +# process_event() +# +def process_event(ctx, data, size): + global start_trigger_ts + global stop_trigger_ts + + event = bpf['events'].event(data) + if event.id == Event.SYSCALL: + syscall_events.append({"tid": event.tid, + "ts_entry": event.entry_ts, + "ts_exit": event.ts, + "syscall": event.syscall, + "user_stack_id": event.user_stack_id, + "kernel_stack_id": event.kernel_stack_id}) + elif event.id == Event.START_TRIGGER: + # + # This event would have started the trigger already, so all we need to + # do is record the start timestamp. + # + start_trigger_ts = event.ts + + elif event.id == Event.STOP_TRIGGER: + # + # This event would have stopped the trigger already, so all we need to + # do is record the start timestamp. + stop_trigger_ts = event.ts + + +# +# next_power_of_two() +# +def next_power_of_two(val): + np = 1 + while np < val: + np *= 2 + return np + + +# +# unsigned_int() +# +def unsigned_int(value): + try: + value = int(value) + except ValueError: + raise argparse.ArgumentTypeError("must be an integer") + + if value < 0: + raise argparse.ArgumentTypeError("must be positive") + return value + + +# +# unsigned_nonzero_int() +# +def unsigned_nonzero_int(value): + value = unsigned_int(value) + if value == 0: + raise argparse.ArgumentTypeError("must be nonzero") + return value + + +# +# get_thread_name() +# +def get_thread_name(pid, tid): + try: + with open(f"/proc/{pid}/task/{tid}/comm", encoding="utf8") as f: + return f.readline().strip("\n") + except FileNotFoundError: + pass + + return f"" + + +# +# get_vec_nr_name() +# +def get_vec_nr_name(vec_nr): + known_vec_nr = ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll", + "tasklet", "sched", "hrtimer", "rcu"] + + if vec_nr < 0 or vec_nr > len(known_vec_nr): + return f"" + + return known_vec_nr[vec_nr] + + +# +# start/stop/reset capture +# +def start_capture(): + bpf["capture_on"][ct.c_int(0)] = ct.c_int(1) + + +def stop_capture(force=False): + if force: + bpf["capture_on"][ct.c_int(0)] = ct.c_int(0xffff) + else: + bpf["capture_on"][ct.c_int(0)] = ct.c_int(0) + + +def capture_running(): + return bpf["capture_on"][ct.c_int(0)].value == 1 + + +def reset_capture(): + bpf["syscall_start"].clear() + bpf["syscall_data"].clear() + bpf["run_start"].clear() + bpf["run_data"].clear() + bpf["ready_start"].clear() + bpf["ready_data"].clear() + bpf["hardirq_start"].clear() + bpf["hardirq_data"].clear() + bpf["softirq_start"].clear() + bpf["softirq_data"].clear() + bpf["stack_traces"].clear() + + +# +# Display timestamp +# +def print_timestamp(msg): + ltz = datetime.datetime.now() + utc = ltz.astimezone(pytz.utc) + time_string = "{} @{} ({} UTC)".format( + msg, ltz.isoformat(), utc.strftime("%H:%M:%S")) + print(time_string) + + +# +# process_results() +# +def process_results(syscall_events=None, trigger_delta=None): + if trigger_delta: + print_timestamp("# Triggered sample dump, stop-start delta {:,} ns". + format(trigger_delta)) + else: + print_timestamp("# Sample dump") + + # + # First get a list of all threads we need to report on. + # + threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items() + if k.syscall != 0xffffffff} + + threads_run = {k.tid for k, _ in bpf["run_data"].items() + if k.pid != 0xffffffff} + + threads_ready = {k.tid for k, _ in bpf["ready_data"].items() + if k.pid != 0xffffffff} + + threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items() + if k.pid != 0xffffffff} + + threads_softirq = {k.tid for k, _ in bpf["softirq_data"].items() + if k.pid != 0xffffffff} + + threads = sorted(threads_syscall | threads_run | threads_ready | + threads_hardirq | threads_softirq, + key=lambda x: get_thread_name(options.pid, x)) + + # + # Print header... + # + print("{:10} {:16} {}".format("TID", "THREAD", "")) + print("{:10} {:16} {}".format("-" * 10, "-" * 16, "-" * 76)) + indent = 28 * " " + + # + # Print all events/statistics per threads. + # + poll_id = [k for k, v in syscalls.items() if v == b'poll'][0] + for thread in threads: + + if thread != threads[0]: + print("") + + # + # SYSCALL_STATISTICS + # + print("{:10} {:16} {}\n{}{:20} {:>6} {:>10} {:>16} {:>16}".format( + thread, get_thread_name(options.pid, thread), + "[SYSCALL STATISTICS]", indent, + "NAME", "NUMBER", "COUNT", "TOTAL ns", "MAX ns")) + + total_count = 0 + total_ns = 0 + for k, v in sorted(filter(lambda t: t[0].tid == thread, + bpf["syscall_data"].items()), + key=lambda kv: -kv[1].total_ns): + + print("{}{:20.20} {:6} {:10} {:16,} {:16,}".format( + indent, syscall_name(k.syscall).decode('utf-8'), k.syscall, + v.count, v.total_ns, v.worst_ns)) + if k.syscall != poll_id: + total_count += v.count + total_ns += v.total_ns + + if total_count > 0: + print("{}{:20.20} {:6} {:10} {:16,}".format( + indent, "TOTAL( - poll):", "", total_count, total_ns)) + + # + # THREAD RUN STATISTICS + # + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format( + "", "", "[THREAD RUN STATISTICS]", indent, + "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns")) + + for k, v in filter(lambda t: t[0].tid == thread, + bpf["run_data"].items()): + + print("{}{:10} {:16,} {:16,} {:16,}".format( + indent, v.count, v.total_ns, v.min_ns, v.max_ns)) + + # + # THREAD READY STATISTICS + # + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format( + "", "", "[THREAD READY STATISTICS]", indent, + "SCHED_CNT", "TOTAL ns", "MAX ns")) + + for k, v in filter(lambda t: t[0].tid == thread, + bpf["ready_data"].items()): + + print("{}{:10} {:16,} {:16,}".format( + indent, v.count, v.total_ns, v.worst_ns)) + + # + # HARD IRQ STATISTICS + # + total_ns = 0 + total_count = 0 + header_printed = False + for k, v in sorted(filter(lambda t: t[0].tid == thread, + bpf["hardirq_data"].items()), + key=lambda kv: -kv[1].total_ns): + + if not header_printed: + print("\n{:10} {:16} {}\n{}{:20} {:>10} {:>16} {:>16}". + format("", "", "[HARD IRQ STATISTICS]", indent, + "NAME", "COUNT", "TOTAL ns", "MAX ns")) + header_printed = True + + print("{}{:20.20} {:10} {:16,} {:16,}".format( + indent, k.irq_name.decode('utf-8'), + v.count, v.total_ns, v.worst_ns)) + + total_count += v.count + total_ns += v.total_ns + + if total_count > 0: + print("{}{:20.20} {:10} {:16,}".format( + indent, "TOTAL:", total_count, total_ns)) + + # + # SOFT IRQ STATISTICS + # + total_ns = 0 + total_count = 0 + header_printed = False + for k, v in sorted(filter(lambda t: t[0].tid == thread, + bpf["softirq_data"].items()), + key=lambda kv: -kv[1].total_ns): + + if not header_printed: + print("\n{:10} {:16} {}\n" + "{}{:20} {:>7} {:>10} {:>16} {:>16}". + format("", "", "[SOFT IRQ STATISTICS]", indent, + "NAME", "VECT_NR", "COUNT", "TOTAL ns", "MAX ns")) + header_printed = True + + print("{}{:20.20} {:>7} {:10} {:16,} {:16,}".format( + indent, get_vec_nr_name(k.vec_nr), k.vec_nr, + v.count, v.total_ns, v.worst_ns)) + + total_count += v.count + total_ns += v.total_ns + + if total_count > 0: + print("{}{:20.20} {:7} {:10} {:16,}".format( + indent, "TOTAL:", "", total_count, total_ns)) + + # + # Print events + # + lost_stack_traces = 0 + if syscall_events: + stack_traces = bpf.get_table("stack_traces") + + print("\n\n# SYSCALL EVENTS:" + "\n{}{:>19} {:>19} {:>10} {:16} {:>10} {}".format( + 2 * " ", "ENTRY (ns)", "EXIT (ns)", "TID", "COMM", + "DELTA (us)", "SYSCALL")) + print("{}{:19} {:19} {:10} {:16} {:10} {}".format( + 2 * " ", "-" * 19, "-" * 19, "-" * 10, "-" * 16, + "-" * 10, "-" * 16)) + for event in syscall_events: + print("{}{:19} {:19} {:10} {:16} {:10,} {}".format( + " " * 2, + event["ts_entry"], event["ts_exit"], event["tid"], + get_thread_name(options.pid, event["tid"]), + int((event["ts_exit"] - event["ts_entry"]) / 1000), + syscall_name(event["syscall"]).decode('utf-8'))) + # + # Not sure where to put this, but I'll add some info on stack + # traces here... Userspace stack traces are very limited due to + # the fact that bcc does not support dwarf backtraces. As OVS + # gets compiled without frame pointers we will not see much. + # If however, OVS does get built with frame pointers, we should not + # use the BPF_STACK_TRACE_BUILDID as it does not seem to handle + # the debug symbols correctly. Also, note that for kernel + # traces you should not use BPF_STACK_TRACE_BUILDID, so two + # buffers are needed. + # + # Some info on manual dwarf walk support: + # https://github.com/iovisor/bcc/issues/3515 + # https://github.com/iovisor/bcc/pull/4463 + # + if options.stack_trace_size == 0: + continue + + if event['kernel_stack_id'] < 0 or event['user_stack_id'] < 0: + lost_stack_traces += 1 + + kernel_stack = stack_traces.walk(event['kernel_stack_id']) \ + if event['kernel_stack_id'] >= 0 else [] + user_stack = stack_traces.walk(event['user_stack_id']) \ + if event['user_stack_id'] >= 0 else [] + + for addr in kernel_stack: + print("{}{}".format( + " " * 10, + bpf.ksym(addr, show_module=True, + show_offset=True).decode('utf-8', 'replace'))) + + for addr in user_stack: + addr_str = bpf.sym(addr, options.pid, show_module=True, + show_offset=True).decode('utf-8', 'replace') + + if addr_str == "[unknown]": + addr_str += " 0x{:x}".format(addr) + + print("{}{}".format(" " * 10, addr_str)) + + # + # Print any footer messages. + # + if lost_stack_traces > 0: + print("\n#WARNING: We where not able to display {} stack traces!\n" + "# Consider increasing the stack trace size using\n" + "# the '--stack-trace-size' option.\n" + "# Note that this can also happen due to a stack id\n" + "# collision.".format(lost_stack_traces)) + + +# +# main() +# +def main(): + # + # Don't like these globals, but ctx passing does not seem to work with the + # existing open_ring_buffer() API :( + # + global bpf + global options + global syscall_events + global start_trigger_ts + global stop_trigger_ts + + start_trigger_ts = 0 + stop_trigger_ts = 0 + + # + # Argument parsing + # + parser = argparse.ArgumentParser() + + parser.add_argument("-D", "--debug", + help="Enable eBPF debugging", + type=int, const=0x3f, default=0, nargs='?') + parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID", + help="ovs-vswitch's PID", + type=unsigned_int, default=None) + parser.add_argument("-s", "--syscall-events", metavar="DURATION_NS", + help="Record syscall events that take longer than " + "DURATION_NS. Omit the duration value to record all " + "syscall events", + type=unsigned_int, const=0, default=None, nargs='?') + parser.add_argument("--buffer-page-count", + help="Number of BPF ring buffer pages, default 1024", + type=unsigned_int, default=1024, metavar="NUMBER") + parser.add_argument("--sample-count", + help="Number of sample runs, default 1", + type=unsigned_nonzero_int, default=1, metavar="RUNS") + parser.add_argument("--sample-interval", + help="Delay between sample runs, default 0", + type=float, default=0, metavar="SECONDS") + parser.add_argument("--sample-time", + help="Sample time, default 0.5 seconds", + type=float, default=0.5, metavar="SECONDS") + parser.add_argument("--skip-syscall-poll-events", + help="Skip poll() syscalls with --syscall-events", + action="store_true") + parser.add_argument("--stack-trace-size", + help="Number of unique stack traces that can be " + "recorded, default 4096. 0 to disable", + type=unsigned_int, default=4096) + parser.add_argument("--start-trigger", metavar="TRIGGER", + help="Start trigger, see documentation for details", + type=str, default=None) + parser.add_argument("--stop-trigger", metavar="TRIGGER", + help="Stop trigger, see documentation for details", + type=str, default=None) + parser.add_argument("--trigger-delta", metavar="DURATION_NS", + help="Only report event when the trigger duration > " + "DURATION_NS, default 0 (all events)", + type=unsigned_int, const=0, default=0, nargs='?') + + options = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if not options.pid: + for proc in psutil.process_iter(): + if 'ovs-vswitchd' in proc.name(): + if options.pid: + print("ERROR: Multiple ovs-vswitchd daemons running, " + "use the -p option!") + sys.exit(os.EX_NOINPUT) + + options.pid = proc.pid + + # + # Error checking on input parameters. + # + if not options.pid: + print("ERROR: Failed to find ovs-vswitchd's PID!") + sys.exit(os.EX_UNAVAILABLE) + + options.buffer_page_count = next_power_of_two(options.buffer_page_count) + + # + # Make sure we are running as root, or else we can not attach the probes. + # + if os.geteuid() != 0: + print("ERROR: We need to run as root to attached probes!") + sys.exit(os.EX_NOPERM) + + # + # Setup any of the start stop triggers + # + if options.start_trigger is not None: + try: + start_trigger = Probe(options.start_trigger, pid=options.pid) + except ValueError as e: + print(f"ERROR: Invalid start trigger {str(e)}") + sys.exit(os.EX_CONFIG) + else: + start_trigger = None + + if options.stop_trigger is not None: + try: + stop_trigger = Probe(options.stop_trigger, pid=options.pid) + except ValueError as e: + print(f"ERROR: Invalid stop trigger {str(e)}") + sys.exit(os.EX_CONFIG) + else: + stop_trigger = None + + # + # Attach probe to running process. + # + source = EBPF_SOURCE.replace("", "\n".join( + [" EVENT_{} = {},".format( + event.name, event.value) for event in Event])) + source = source.replace("", + str(options.buffer_page_count)) + source = source.replace("", str(options.pid)) + + if BPF.kernel_struct_has_field(b'task_struct', b'state') == 1: + source = source.replace('', 'state') + else: + source = source.replace('', '__state') + + poll_id = [k for k, v in syscalls.items() if v == b'poll'][0] + if options.syscall_events is None: + syscall_trace_events = "false" + elif options.syscall_events == 0: + if not options.skip_syscall_poll_events: + syscall_trace_events = "true" + else: + syscall_trace_events = f"args->id != {poll_id}" + else: + syscall_trace_events = "delta > {}".format(options.syscall_events) + if options.skip_syscall_poll_events: + syscall_trace_events += f" && args->id != {poll_id}" + + source = source.replace("", + syscall_trace_events) + + source = source.replace("", + str(options.stack_trace_size)) + + source = source.replace("", "true" + if options.stack_trace_size > 0 else "false") + + # + # Handle start/stop probes + # + if start_trigger: + source = source.replace("", + start_trigger.get_c_code( + "start_trigger_probe", + "return start_trigger();")) + else: + source = source.replace("", "") + + if stop_trigger: + source = source.replace("", + stop_trigger.get_c_code( + "stop_trigger_probe", + "return stop_trigger();")) + else: + source = source.replace("", "") + + # + # Setup usdt or other probes that need handling trough the BFP class. + # + usdt = USDT(pid=int(options.pid)) + try: + if start_trigger and start_trigger.probe_type == 'usdt': + usdt.enable_probe(probe=start_trigger.probe_name(), + fn_name="start_trigger_probe") + if stop_trigger and stop_trigger.probe_type == 'usdt': + usdt.enable_probe(probe=stop_trigger.probe_name(), + fn_name="stop_trigger_probe") + + except USDTException as e: + print("ERROR: {}".format( + (re.sub('^', ' ' * 7, str(e), flags=re.MULTILINE)).strip(). + replace("--with-dtrace or --enable-dtrace", + "--enable-usdt-probes"))) + sys.exit(os.EX_OSERR) + + bpf = BPF(text=source, usdt_contexts=[usdt], debug=options.debug) + + if start_trigger: + try: + if start_trigger.probe_type == "uprobe": + bpf.attach_uprobe(name=f"/proc/{options.pid}/exe", + sym=start_trigger.probe_name(), + fn_name="start_trigger_probe", + pid=options.pid) + + if start_trigger.probe_type == "uretprobe": + bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe", + sym=start_trigger.probe_name(), + fn_name="start_trigger_probe", + pid=options.pid) + except Exception as e: + print("ERROR: Failed attaching uprobe start trigger " + f"'{start_trigger.probe_name()}';\n {str(e)}") + sys.exit(os.EX_OSERR) + + if stop_trigger: + try: + if stop_trigger.probe_type == "uprobe": + bpf.attach_uprobe(name=f"/proc/{options.pid}/exe", + sym=stop_trigger.probe_name(), + fn_name="stop_trigger_probe", + pid=options.pid) + + if stop_trigger.probe_type == "uretprobe": + bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe", + sym=stop_trigger.probe_name(), + fn_name="stop_trigger_probe", + pid=options.pid) + except Exception as e: + print("ERROR: Failed attaching uprobe stop trigger" + f"'{stop_trigger.probe_name()}';\n {str(e)}") + sys.exit(os.EX_OSERR) + + # + # If no triggers are configured use the delay configuration + # + bpf['events'].open_ring_buffer(process_event) + + sample_count = 0 + while sample_count < options.sample_count: + sample_count += 1 + syscall_events = [] + + if not options.start_trigger: + print_timestamp("# Start sampling") + start_capture() + stop_time = -1 if options.stop_trigger else \ + time_ns() + options.sample_time * 1000000000 + else: + # For start triggers the stop time depends on the start trigger + # time, or depends on the stop trigger if configured. + stop_time = -1 if options.stop_trigger else 0 + + while True: + keyboard_interrupt = False + try: + last_start_ts = start_trigger_ts + last_stop_ts = stop_trigger_ts + + if stop_time > 0: + delay = int((stop_time - time_ns()) / 1000000) + if delay <= 0: + break + else: + delay = -1 + + bpf.ring_buffer_poll(timeout=delay) + + if stop_time <= 0 and last_start_ts != start_trigger_ts: + print_timestamp( + "# Start sampling (trigger@{})".format( + start_trigger_ts)) + + if not options.stop_trigger: + stop_time = time_ns() + \ + options.sample_time * 1000000000 + + if last_stop_ts != stop_trigger_ts: + break + + except KeyboardInterrupt: + keyboard_interrupt = True + break + + if options.stop_trigger and not capture_running(): + print_timestamp("# Stop sampling (trigger@{})".format( + stop_trigger_ts)) + else: + print_timestamp("# Stop sampling") + + if stop_trigger_ts != 0 and start_trigger_ts != 0: + trigger_delta = stop_trigger_ts - start_trigger_ts + else: + trigger_delta = None + + if not trigger_delta or trigger_delta >= options.trigger_delta: + stop_capture(force=True) # Prevent a new trigger to start. + process_results(syscall_events=syscall_events, + trigger_delta=trigger_delta) + elif trigger_delta: + sample_count -= 1 + print_timestamp("# Sample dump skipped, delta {:,} ns".format( + trigger_delta)) + + reset_capture() + stop_capture() + + if keyboard_interrupt: + break + + if options.sample_interval > 0: + time.sleep(options.sample_interval) + + # + # Report lost events. + # + dropcnt = bpf.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print("\n# WARNING: Not all events were captured, {} were " + "dropped!\n# Increase the BPF ring buffer size " + "with the --buffer-page-count option.".format(count)) + + if (options.sample_count > 1): + trigger_miss = bpf.get_table("trigger_miss") + for k in trigger_miss.keys(): + count = trigger_miss.sum(k).value + if k.value == 0 and count > 0: + print("\n# WARNING: Not all start triggers were successful. " + "{} were missed due to\n# slow userspace " + "processing!".format(count)) + + +# +# Start main() as the default entry point... +# +if __name__ == '__main__': + main() diff --git a/utilities/usdt-scripts/kernel_delay.rst b/utilities/usdt-scripts/kernel_delay.rst new file mode 100644 index 00000000000..0ebd30afb67 --- /dev/null +++ b/utilities/usdt-scripts/kernel_delay.rst @@ -0,0 +1,596 @@ +Troubleshooting Open vSwitch: Is the kernel to blame? +===================================================== +Often, when troubleshooting Open vSwitch (OVS) in the field, you might be left +wondering if the issue is really OVS-related, or if it's a problem with the +kernel being overloaded. Messages in the log like +``Unreasonably long XXXXms poll interval`` might suggest it's OVS, but from +experience, these are mostly related to an overloaded Linux Kernel. +The kernel_delay.py tool can help you quickly identify if the focus of your +investigation should be OVS or the Linux kernel. + + +Introduction +------------ +``kernel_delay.py`` consists of a Python script that uses the BCC [#BCC]_ +framework to install eBPF probes. The data the eBPF probes collect will be +analyzed and presented to the user by the Python script. Some of the presented +data can also be captured by the individual scripts included in the BBC [#BCC]_ +framework. + +kernel_delay.py has two modes of operation: + +- In **time mode**, the tool runs for a specific time and collects the + information. +- In **trigger mode**, event collection can be started and/or stopped based on + a specific eBPF probe. Currently, the following probes are supported: + - USDT probes + - Kernel tracepoints + - kprobe + - kretprobe + - uprobe + - uretprobe + + +In addition, the option, ``--sample-count``, exists to specify how many +iterations you would like to do. When using triggers, you can also ignore +samples if they are less than a number of nanoseconds with the +``--trigger-delta`` option. The latter might be useful when debugging Linux +syscalls which take a long time to complete. More on this later. Finally, you +can configure the delay between two sample runs with the ``--sample-interval`` +option. + +Before getting into more details, you can run the tool without any options +to see what the output looks like. Notice that it will try to automatically +get the process ID of the running ``ovs-vswitchd``. You can overwrite this +with the ``--pid`` option. + +.. code-block:: console + + $ sudo ./kernel_delay.py + # Start sampling @2023-06-08T12:17:22.725127 (10:17:22 UTC) + # Stop sampling @2023-06-08T12:17:23.224781 (10:17:23 UTC) + # Sample dump @2023-06-08T12:17:23.224855 (10:17:23 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 27090 ovs-vswitchd [SYSCALL STATISTICS] + + + 31741 revalidator122 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + poll 7 5 184,193,176 184,191,520 + recvmsg 47 494 125,208,756 310,331 + futex 202 8 18,768,758 4,023,039 + sendto 44 10 375,861 266,867 + sendmsg 46 4 43,294 11,213 + write 1 1 5,949 5,949 + getrusage 98 1 1,424 1,424 + read 0 1 1,292 1,292 + TOTAL( - poll): 519 144,405,334 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 6 136,764,071 1,480 115,146,424 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 7 11,334 6,636 + + [HARD IRQ STATISTICS] + NAME COUNT TOTAL ns MAX ns + eno8303-rx-1 1 3,586 3,586 + TOTAL: 1 3,586 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + net_rx 3 1 17,699 17,699 + sched 7 6 13,820 3,226 + rcu 9 16 13,586 1,554 + timer 1 3 10,259 3,815 + TOTAL: 26 55,364 + + +By default, the tool will run for half a second in `time mode`. To extend this +you can use the ``--sample-time`` option. + + +What will it report +------------------- +The above sample output separates the captured data on a per-thread basis. +For this, it displays the thread's id (``TID``) and name (``THREAD``), +followed by resource-specific data. Which are: + +- ``SYSCALL STATISTICS`` +- ``THREAD RUN STATISTICS`` +- ``THREAD READY STATISTICS`` +- ``HARD IRQ STATISTICS`` +- ``SOFT IRQ STATISTICS`` + +The following sections will describe in detail what statistics they report. + + +``SYSCALL STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~ +``SYSCALL STATISTICS`` tell you which Linux system calls got executed during +the measurement interval. This includes the number of times the syscall was +called (``COUNT``), the total time spent in the system calls (``TOTAL ns``), +and the worst-case duration of a single call (``MAX ns``). + +It also shows the total of all system calls, but it excludes the poll system +call, as the purpose of this call is to wait for activity on a set of sockets, +and usually, the thread gets swapped out. + +Note that it only counts calls that started and stopped during the +measurement interval! + + +``THREAD RUN STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~~~ +``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU +during the measurement interval. + +Note that these statistics only count events where the thread started and +stopped running on a CPU during the measurement interval. For example, if +this was a PMD thread, you should see zero ``SCHED_CNT`` and ``TOTAL_ns``. +If not, there might be a misconfiguration. + + +``THREAD READY STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``THREAD READY STATISTICS`` tell you the time between the thread being ready +to run and it actually running on the CPU. + +Note that these statistics only count events where the thread was getting +ready to run and started running during the measurement interval. + + +``HARD IRQ STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~ +``HARD IRQ STATISTICS`` tell you how much time was spent servicing hard +interrupts during the threads run time. + +It shows the interrupt name (``NAME``), the number of interrupts (``COUNT``), +the total time spent in the interrupt handler (``TOTAL ns``), and the +worst-case duration (``MAX ns``). + + +``SOFT IRQ STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~ +``SOFT IRQ STATISTICS`` tell you how much time was spent servicing soft +interrupts during the threads run time. + +It shows the interrupt name (``NAME``), vector number (``VECT_NR``), the +number of interrupts (``COUNT``), the total time spent in the interrupt +handler (``TOTAL ns``), and the worst-case duration (``MAX ns``). + + +The ``--syscall-events`` option +------------------------------- +In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``, +the tool can also report each individual syscall. This can be a usefull +second step if the ``SYSCALL_STATISTICS`` show high latency numbers. + +All you need to do is add the ``--syscall-events`` option, with or without +the additional ``DURATION_NS`` parameter. The ``DUTATION_NS`` parameter +allows you to exclude events that take less than the supplied time. + +The ``--skip-syscall-poll-events`` option allows you to exclude poll +syscalls from the report. + +Below is an example run, note that the resource-specific data is removed +to highlight the syscall events: + +.. code-block:: console + + $ sudo ./kernel_delay.py --syscall-events 50000 --skip-syscall-poll-events + # Start sampling @2023-06-13T17:10:46.460874 (15:10:46 UTC) + # Stop sampling @2023-06-13T17:10:46.960727 (15:10:46 UTC) + # Sample dump @2023-06-13T17:10:46.961033 (15:10:46 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 3359686 ipf_clean2 [SYSCALL STATISTICS] + ... + 3359635 ovs-vswitchd [SYSCALL STATISTICS] + ... + 3359697 revalidator12 [SYSCALL STATISTICS] + ... + 3359698 revalidator13 [SYSCALL STATISTICS] + ... + 3359699 revalidator14 [SYSCALL STATISTICS] + ... + 3359700 revalidator15 [SYSCALL STATISTICS] + ... + + # SYSCALL EVENTS: + ENTRY (ns) EXIT (ns) TID COMM DELTA (us) SYSCALL + ------------------- ------------------- ---------- ---------------- ---------- ---------------- + 2161821694935486 2161821695031201 3359699 revalidator14 95 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] + ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003936313a63 + 2161821695276882 2161821695333687 3359698 revalidator13 56 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] + ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003134313a63 + 2161821695275820 2161821695405733 3359700 revalidator15 129 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] + ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003936313a63 + 2161821695964969 2161821696052021 3359635 ovs-vswitchd 87 accept + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI_accept+0x4d [libc.so.6] + pfd_accept+0x3a [ovs-vswitchd] + [unknown] 0x7fff19f2bd00 + [unknown] 0xe4b8001f0f + +As you can see above, the output also shows the stackback trace. You can +disable this using the ``--stack-trace-size 0`` option. + +As you can see above, the backtrace does not show a lot of useful information +due to the BCC [#BCC]_ toolkit not supporting dwarf decoding. So to further +analyze system call backtraces, you could use perf. The following perf +script can do this for you (refer to the embedded instructions): + +https://github.com/chaudron/perf_scripts/blob/master/analyze_perf_pmd_syscall.py + + +Using triggers +-------------- +The tool supports start and, or stop triggers. This will allow you to capture +statistics triggered by a specific event. The following combinations of +stop-and-start triggers can be used. + +If you only use ``--start-trigger``, the inspection start when the trigger +happens and runs until the ``--sample-time`` number of seconds has passed. +The example below shows all the supported options in this scenario. + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run --sample-time 4 \ + --sample-count 4 --sample-interval 1 + + +If you only use ``--stop-trigger``, the inspection starts immediately and +stops when the trigger happens. The example below shows all the supported +options in this scenario. + +.. code-block:: console + + $ sudo ./kernel_delay.py --stop-trigger upr:bridge_run \ + --sample-count 4 --sample-interval 1 + + +If you use both ``--start-trigger`` and ``--stop-trigger`` triggers, the +statistics are captured between the two first occurrences of these events. +The example below shows all the supported options in this scenario. + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run \ + --stop-trigger upr:bridge_run \ + --sample-count 4 --sample-interval 1 \ + --trigger-delta 50000 + +What triggers are supported? Note that what ``kernel_delay.py`` calls triggers, +BCC [#BCC]_, calls events; these are eBPF tracepoints you can attach to. +For more details on the supported tracepoints, check out the BCC +documentation [#BCC_EVENT]_. + +The list below shows the supported triggers and their argument format: + +**USDT probes:** + [u|usdt]:{provider}:{probe} +**Kernel tracepoint:** + [t:trace]:{system}:{event} +**kprobe:** + [k:kprobe]:{kernel_function} +**kretprobe:** + [kr:kretprobe]:{kernel_function} +**uprobe:** + [up:uprobe]:{function} +**uretprobe:** + [upr:uretprobe]:{function} + +Here are a couple of trigger examples, more use-case-specific examples can be +found in the *Examples* section. + +.. code-block:: console + + --start|stop-trigger u:udpif_revalidator:start_dump + --start|stop-trigger t:openvswitch:ovs_dp_upcall + --start|stop-trigger k:ovs_dp_process_packet + --start|stop-trigger kr:ovs_dp_process_packet + --start|stop-trigger up:bridge_run + --start|stop-trigger upr:bridge_run + + +Examples +-------- +This section will give some examples of how to use this tool in real-world +scenarios. Let's start with the issue where Open vSwitch reports +``Unreasonably long XXXXms poll interval`` on your revalidator threads. Note +that there is a blog available explaining how the revalidator process works +in OVS [#REVAL_BLOG]_. + +First, let me explain this log message. It gets logged if the time delta +between two ``poll_block()`` calls is more than 1 second. In other words, +the process was spending a lot of time processing stuff that was made +available by the return of the ``poll_block()`` function. + +Do a run with the tool using the existing USDT revalidator probes as a start +and stop trigger (Note that the resource-specific data is removed from the none +revalidator threads): + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump --stop-trigger u:udpif_revalidator:sweep_done + # Start sampling (trigger@791777093512008) @2023-06-14T14:52:00.110303 (12:52:00 UTC) + # Stop sampling (trigger@791778281498462) @2023-06-14T14:52:01.297975 (12:52:01 UTC) + # Triggered sample dump, stop-start delta 1,187,986,454 ns @2023-06-14T14:52:01.298021 (12:52:01 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1457761 handler24 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + sendmsg 46 6110 123,274,761 41,776 + recvmsg 47 136299 99,397,508 49,896 + futex 202 51 7,655,832 7,536,776 + poll 7 4068 1,202,883 2,907 + getrusage 98 2034 586,602 1,398 + sendto 44 9 213,682 27,417 + TOTAL( - poll): 144503 231,128,385 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 1 1,438 1,438 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 21 59,145 3,769 + rcu 9 50 42,917 2,234 + TOTAL: 71 102,062 + 1457733 ovs-vswitchd [SYSCALL STATISTICS] + ... + 1457792 revalidator55 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + futex 202 73 572,576,329 19,621,600 + recvmsg 47 815 296,697,618 405,338 + sendto 44 3 78,302 26,837 + sendmsg 46 3 38,712 13,250 + write 1 1 5,073 5,073 + TOTAL( - poll): 895 869,396,034 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 48 394,350,393 1,729 140,455,796 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 49 23,650 1,559 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 14 26,889 3,041 + rcu 9 28 23,024 1,600 + TOTAL: 42 49,913 + + +Above you see from the start of the output that the trigger took more than a +second (1,187,986,454 ns), which is already know, by looking at the output of +the ``ovs-vsctl upcall/show`` command. + +From the *revalidator55*'s ``SYSCALL STATISTICS`` statistics you can see it +spent almost 870ms handling syscalls, and there were no poll() calls being +executed. The ``THREAD RUN STATISTICS`` statistics here are a bit misleading, +as it looks like OVS only spent 394ms on the CPU. But earlier, it was mentioned +that this time does not include the time being on the CPU at the start or stop +of an event. What is exactly the case here, because USDT probes were used. + +From the above data and maybe some ``top`` output, it can be determined that +the *revalidator55* thread is taking a lot of CPU time, probably because it +has to do a lot of revalidator work by itself. The solution here is to increase +the number of revalidator threads, so more work could be done in parallel. + +Here is another run of the same command in another scenario: + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump --stop-trigger u:udpif_revalidator:sweep_done + # Start sampling (trigger@795160501758971) @2023-06-14T15:48:23.518512 (13:48:23 UTC) + # Stop sampling (trigger@795160764940201) @2023-06-14T15:48:23.781381 (13:48:23 UTC) + # Triggered sample dump, stop-start delta 263,181,230 ns @2023-06-14T15:48:23.781414 (13:48:23 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1457733 ovs-vswitchd [SYSCALL STATISTICS] + ... + 1457792 revalidator55 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + recvmsg 47 284 193,422,110 46,248,418 + sendto 44 2 46,685 23,665 + sendmsg 46 2 24,916 12,703 + write 1 1 6,534 6,534 + TOTAL( - poll): 289 193,500,245 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 2 47,333,558 331,516 47,002,042 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 3 87,000,403 45,999,712 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 2 9,504 5,109 + TOTAL: 2 9,504 + + +Here you can see the revalidator run took about 263ms, which does not look +odd, however, the ``THREAD READY STATISTICS`` information shows that OVS was +waiting 87ms for a CPU to be run on. This means the revalidator process could +have finished 87ms faster. Looking at the ``MAX ns`` value, a worst-case delay +of almost 46ms can be seen, which hints at an overloaded system. + +One final example that uses a ``uprobe`` to get some statistics on a +``bridge_run()`` execution that takes more than 1ms. + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run --trigger-delta 1000000 + # Start sampling (trigger@2245245432101270) @2023-06-14T16:21:10.467919 (14:21:10 UTC) + # Stop sampling (trigger@2245245432414656) @2023-06-14T16:21:10.468296 (14:21:10 UTC) + # Sample dump skipped, delta 313,386 ns @2023-06-14T16:21:10.468419 (14:21:10 UTC) + # Start sampling (trigger@2245245505301745) @2023-06-14T16:21:10.540970 (14:21:10 UTC) + # Stop sampling (trigger@2245245506911119) @2023-06-14T16:21:10.542499 (14:21:10 UTC) + # Triggered sample dump, stop-start delta 1,609,374 ns @2023-06-14T16:21:10.542565 (14:21:10 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 3371035 [SYSCALL STATISTICS] + ... + 3371102 handler66 [SYSCALL STATISTICS] + ... + 3366258 ovs-vswitchd [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + futex 202 43 403,469 199,312 + clone3 435 13 174,394 30,731 + munmap 11 8 115,774 21,861 + poll 7 5 92,969 38,307 + unlink 87 2 49,918 35,741 + mprotect 10 8 47,618 13,201 + accept 43 10 31,360 6,976 + mmap 9 8 30,279 5,776 + write 1 6 27,720 11,774 + rt_sigprocmask 14 28 12,281 970 + read 0 6 9,478 2,318 + recvfrom 45 3 7,024 4,024 + sendto 44 1 4,684 4,684 + getrusage 98 5 4,594 1,342 + close 3 2 2,918 1,627 + recvmsg 47 1 2,722 2,722 + TOTAL( - poll): 144 924,233 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 13 817,605 5,433 524,376 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 14 28,646 11,566 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + rcu 9 1 2,838 2,838 + TOTAL: 1 2,838 + + 3371110 revalidator74 [SYSCALL STATISTICS] + ... + 3366311 urcu3 [SYSCALL STATISTICS] + ... + + +OVS removed some of the threads and their resource-specific data, but based +on the ```` thread name, you can determine that some +threads no longer exist. In the ``ovs-vswitchd`` thread, you can see some +``clone3`` syscalls, indicating threads were created. In this example, it was +due to the deletion of a bridge, which resulted in the recreation of the +revalidator and handler threads. + + +Use with Openshift +------------------ +This section describes how you would use the tool on a node in an OpenShift +cluster. It assumes you have console access to the node, either directly or +through a debug container. + +A base fedora38 container will be used through podman, as this will allow the +use of some additional tools and packages needed. + +First the containers need to be started: + +.. code-block:: console + + [core@sno-master ~]$ sudo podman run -it --rm \ + -e PS1='[(DEBUG)\u@\h \W]\$ ' \ + --privileged --network=host --pid=host \ + -v /lib/modules:/lib/modules:ro \ + -v /sys/kernel/debug:/sys/kernel/debug \ + -v /proc:/proc \ + -v /:/mnt/rootdir \ + quay.io/fedora/fedora:38-x86_64 + + [(DEBUG)root@sno-master /]# + + +Next add the ``linux_delay.py`` dependencies: + +.. code-block:: console + + [(DEBUG)root@sno-master /]# dnf install -y bcc-tools perl-interpreter \ + python3-pytz python3-psutil + + +You need to install the devel, debug and source RPMs for your OVS and kernel +version: + +.. code-block:: console + + [(DEBUG)root@sno-master home]# rpm -i \ + openvswitch2.17-debuginfo-2.17.0-67.el8fdp.x86_64.rpm \ + openvswitch2.17-debugsource-2.17.0-67.el8fdp.x86_64.rpm \ + kernel-devel-4.18.0-372.41.1.el8_6.x86_64.rpm + + +Now the tool can be started. Here the above ``bridge_run()`` example is used: + +.. code-block:: console + + [(DEBUG)root@sno-master home]# ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run + # Start sampling (trigger@75279117343513) @2023-06-15T11:44:07.628372 (11:44:07 UTC) + # Stop sampling (trigger@75279117443980) @2023-06-15T11:44:07.628529 (11:44:07 UTC) + # Triggered sample dump, stop-start delta 100,467 ns @2023-06-15T11:44:07.628569 (11:44:07 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1246 ovs-vswitchd [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + getdents64 217 2 8,560 8,162 + openat 257 1 6,951 6,951 + accept 43 4 6,942 3,763 + recvfrom 45 1 3,726 3,726 + recvmsg 47 2 2,880 2,188 + stat 4 2 1,946 1,384 + close 3 1 1,393 1,393 + fstat 5 1 1,324 1,324 + TOTAL( - poll): 14 33,722 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + + +.. rubric:: Footnotes + +.. [#BCC] https://github.com/iovisor/bcc +.. [#BCC_EVENT] https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments +.. [#REVAL_BLOG] https://developers.redhat.com/articles/2022/10/19/open-vswitch-revalidator-process-explained