From 1bb8a32f5c1c267b088568e298d0177cd18a9154 Mon Sep 17 00:00:00 2001
From: Francisco Javier Honduvilla Coto
Date: Wed, 24 Apr 2024 10:14:05 +0100
Subject: [PATCH 1/6] Add kernel tests

This commit adds kernel tests running on x86_64 for 5.15, 6.0, 6.2, and
6.6 using vmtest[0]. The main goal is to ensure that the BPF programs
load fine for various kernels.

The original plan was to have a nix derivation that would do everything:
1) download the kernels 2) build vmtest 3) build lightswitch 4) generate
the vmtest configuration, and finally 5) run vmtest with said
configuration. I attempted to do this in this branch [1] but it was
taking too long and I decided on this approach, which gets us 80% of the
value for 20% of the work.

The downsides of this approach are: 1) As the vmtest config file is not
generated on the fly, the kernel paths have to be hardcoded. This is not
as bad as it might seem because nix guarantees these paths to be the
same across machines, but it's still a bit kludgy; 2) There is an
expectation that lightswitch in dev mode has been built before running
the vmtests.

How to run the kernel tests
===========================

```
nix develop
cargo build
vmtest
```

Notes
=====

We still require qemu-guest-agent to be running in the base OS (nix or
not), which is not ideal, but a proper fix might be quite involved.

Test Plan
=========

Forced a panic to make sure the kernel tests would fail.

- [0]: https://github.com/danobi/vmtest
- [1]: https://github.com/javierhonduco/lightswitch/commits/wip-vmtests
---
 ...upload.yml => static-build-and-upload.yml} |  0
 .github/workflows/vmtests.yml                 | 34 ++++++++
 .gitignore                                    |  1 +
 flake.nix                                     |  5 ++
 vm.nix                                        | 78 +++++++++++++++++++
 vmtest.toml                                   | 19 +++++
 6 files changed, 137 insertions(+)
 rename .github/workflows/{upload.yml => static-build-and-upload.yml} (100%)
 create mode 100644 .github/workflows/vmtests.yml
 create mode 100644 vm.nix
 create mode 100644 vmtest.toml

diff --git a/.github/workflows/upload.yml b/.github/workflows/static-build-and-upload.yml
similarity index 100%
rename from .github/workflows/upload.yml
rename to .github/workflows/static-build-and-upload.yml
diff --git a/.github/workflows/vmtests.yml b/.github/workflows/vmtests.yml
new file mode 100644
index 0000000..a3f4d32
--- /dev/null
+++ b/.github/workflows/vmtests.yml
@@ -0,0 +1,34 @@
+name: vmtests
+on:
+  pull_request:
+  push:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  vmtests:
+    runs-on: ubuntu-22.04
+    permissions:
+      id-token: write
+      contents: read
+
+    steps:
+      - uses: actions/checkout@main
+      - uses: DeterminateSystems/nix-installer-action@main
+      - uses: DeterminateSystems/magic-nix-cache-action@main
+
+      - name: Install system dependencies
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          sudo apt-get -y install --no-install-recommends qemu-system-x86 qemu-guest-agent
+
+      - name: Set up nix dev env
+        run: nix develop --command echo 0
+
+      - name: Build `lightswitch`
+        run: nix develop --ignore-environment --command bash -c 'cargo build'
+
+      - name: Run kernel tests
+        run: nix develop --command 'vmtest'
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 142cfac..5f85844 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /target
 flame.svg
 src/bpf/*_skel.rs
+.vmtest.log
diff --git a/flake.nix b/flake.nix
index c8a4a13..5cff985 100644
--- a/flake.nix
+++ b/flake.nix
@@ -53,6 +53,11 @@
             # snapshot testing plugin binary
             cargo-insta
             # ocamlPackages.magic-trace
+            (import ./vm.nix { inherit
pkgs; }).vmtest + (import ./vm.nix { inherit pkgs; }).kernel_5_15 + (import ./vm.nix { inherit pkgs; }).kernel_6_0 + (import ./vm.nix { inherit pkgs; }).kernel_6_2 + (import ./vm.nix { inherit pkgs; }).kernel_6_6 ]; LIBCLANG_PATH = lib.makeLibraryPath [ llvmPackages_16.libclang ]; diff --git a/vm.nix b/vm.nix new file mode 100644 index 0000000..173f4b2 --- /dev/null +++ b/vm.nix @@ -0,0 +1,78 @@ +{ pkgs }: +{ + kernel_5_15 = pkgs.stdenv.mkDerivation { + name = "download-kernel-5.15"; + src = pkgs.fetchurl { + url = "https://github.com/danobi/vmtest/releases/download/test_assets/bzImage-v5.15-fedora38"; + hash = "sha256-nq8W72vuNKCgO1OS6aJtAfg7AjHavRZ7WAkP7X6V610="; + }; + dontUnpack = true; + installPhase = '' + mkdir -p $out + cp -r $src $out/bzImage + ''; + }; + + kernel_6_0 = pkgs.stdenv.mkDerivation { + name = "download-kernel-6.0"; + src = pkgs.fetchurl { + url = "https://github.com/danobi/vmtest/releases/download/test_assets/bzImage-v6.0-fedora38"; + hash = "sha256-ZBBQ0yVUn+Isd2b+a32oMEbNo8T1v46P3rEtZ+1j9Ic="; + }; + dontUnpack = true; + installPhase = '' + mkdir -p $out + cp -r $src $out/bzImage + ''; + }; + + kernel_6_2 = pkgs.stdenv.mkDerivation { + name = "download-kernel-6.2"; + src = pkgs.fetchurl { + url = "https://github.com/danobi/vmtest/releases/download/test_assets/bzImage-v6.2-fedora38"; + hash = "sha256-YO2HEIWTuEEJts9JrW3V7UVR7t4J3+8On+tjdELa2m8="; + }; + dontUnpack = true; + installPhase = '' + mkdir -p $out + cp -r $src $out/bzImage + ''; + }; + + kernel_6_6 = pkgs.stdenv.mkDerivation { + name = "download-kernel-6.6"; + src = pkgs.fetchurl { + url = "https://github.com/danobi/vmtest/releases/download/test_assets/bzImage-v6.6-fedora38"; + hash = "sha256-6Fu16SPBITP0sI3lapkckZna6GKBn2hID038itt82jA="; + }; + dontUnpack = true; + installPhase = '' + mkdir -p $out + cp -r $src $out/bzImage + ''; + }; + + vmtest = pkgs.rustPlatform.buildRustPackage { + name = "vmtest"; + src = pkgs.fetchFromGitHub { + owner = "danobi"; + repo = "vmtest"; + rev = "51f11bf301fea054342996802a16ed21fb5054f4"; + sha256 = "sha256-qtTq0dnDHi1ITfQzKrXz+1dRMymAFBivWpjXntD09+A="; + }; + cargoHash = "sha256-SHjjCWz4FVVk1cczkMltRVEB3GK8jz2tVABNSlSZiUc="; + # nativeCheckInputs = [ pkgs.qemu ]; + + # There are some errors trying to access `/build/source/tests/*`. 
+    doCheck = false;
+
+    meta = with pkgs.lib; {
+      description = "Helps run tests in virtual machines";
+      homepage = "https://github.com/danobi/vmtest/";
+      license = licenses.asl20;
+      mainProgram = "";
+      maintainers = with maintainers; [ ];
+      platforms = platforms.linux;
+    };
+  };
+}
diff --git a/vmtest.toml b/vmtest.toml
new file mode 100644
index 0000000..3a06893
--- /dev/null
+++ b/vmtest.toml
@@ -0,0 +1,19 @@
+[[target]]
+name = "Fedora 5.15"
+kernel = "/nix/store/cg9wq48zz2dbvayh7sigr5smaf4dwcxp-download-kernel-5.15/bzImage"
+command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
+
+[[target]]
+name = "Fedora 6.0"
+kernel = "/nix/store/p8xc9gqdkpfjpmcb9ja2gx6g6bd9cby7-download-kernel-6.0/bzImage"
+command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
+
+[[target]]
+name = "Fedora 6.2"
+kernel = "/nix/store/79wzcqy12fncdf6pk1fj627ggh7v0nc7-download-kernel-6.2/bzImage"
+command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
+
+[[target]]
+name = "Fedora 6.6"
+kernel = "/nix/store/77ixckavs2qidx1pglmlxsj6bfvjqijb-download-kernel-6.6/bzImage"
+command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
\ No newline at end of file

From a932078cd2728c018ad97a4a65bd6ad5f6cb4f0e Mon Sep 17 00:00:00 2001
From: Francisco Javier Honduvilla Coto
Date: Mon, 29 Apr 2024 11:42:20 +0100
Subject: [PATCH 2/6] Remediation for verifier regression in kernels ~6.8 and greater

A few days ago we added a fix for arm64 kernels, as a verifier regression
had been introduced that caused loading errors
[commit](https://github.com/javierhonduco/lightswitch/commit/a8094d18d0e8a662d3581380934cd185eb30bf45).
It turns out this does not just affect arm64; it just happened that my
arm64 VM was running a more up-to-date kernel. This regression affects
x86 as well.

This commit changes the remediation to unrolling the binary search loop
and adds a kernel regression test. I haven't measured the performance
implication of doing this vs the previous remediation. This is out of
the scope of this PR and it's something I will work on later on.

When I have spare cycles I should report this upstream.
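To make the shape of the change concrete, below is a small stand-alone
sketch of the unrolled, bounded binary search (plain C, builds with
clang, which honours `#pragma unroll`). The types and constants here are
simplified stand-ins rather than the real definitions in
`profiler.bpf.c`; the point is only that the loop has a fixed trip count
so the compiler can fully unroll it.

```
#include <stdint.h>
#include <stdio.h>

// Illustrative values; the real constants live in the BPF headers.
#define MAX_UNWIND_TABLE_SIZE 131072
#define MAX_BINARY_SEARCH_DEPTH 17 // ~log2(MAX_UNWIND_TABLE_SIZE)
#define BINARY_SEARCH_DEFAULT ((uint64_t)-1)

typedef struct {
  uint64_t pc;
} stack_unwind_row_t;

typedef struct {
  stack_unwind_row_t rows[MAX_UNWIND_TABLE_SIZE];
} stack_unwind_table_t;

// Find the last row whose pc is <= the given pc, using a binary search
// with a fixed iteration bound that the compiler can fully unroll.
static uint64_t find_offset_for_pc(stack_unwind_table_t *table, uint64_t pc,
                                   uint64_t left, uint64_t right) {
  uint64_t found = BINARY_SEARCH_DEFAULT;

#pragma unroll
  for (int i = 0; i < MAX_BINARY_SEARCH_DEPTH; i++) {
    if (left >= right) {
      return found;
    }

    uint32_t mid = (left + right) / 2;
    // Redundant for correctness, but this is the kind of bounds check
    // the verifier wants to see before indexing into the table.
    if (mid >= MAX_UNWIND_TABLE_SIZE) {
      return BINARY_SEARCH_DEFAULT;
    }

    if (table->rows[mid].pc <= pc) {
      found = mid;
      left = mid + 1;
    } else {
      right = mid;
    }
  }

  // The search depth was exhausted without converging.
  return BINARY_SEARCH_DEFAULT;
}

int main(void) {
  static stack_unwind_table_t table; // static: too big for the stack
  for (uint64_t i = 0; i < 100; i++) {
    table.rows[i].pc = i * 16;
  }
  // pc 0x55 falls in the row starting at 0x50, i.e. index 5.
  printf("row index: %llu\n",
         (unsigned long long)find_offset_for_pc(&table, 0x55, 0, 100));
  return 0;
}
```

With `#pragma unroll` the loop body is emitted MAX_BINARY_SEARCH_DEPTH
times with no back-edge, which seems to be what keeps the number of
states the verifier has to explore bounded.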
The error ========= ``` 292: (bf) r6 = r3 ; R3_w=scalar(id=52209,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R6_w=scalar(id=52209,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) 293: (bf) r3 = r2 ; R2_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R3_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) 294: (2d) if r4 > r8 goto pc+1 296: R0_w=scalar() R1=11 R2_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R3_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R4_w=scalar(umin=9,umax=0xfffffffffffffffa) R5_w=scalar() R6_w=scalar(id=52209,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R7=scalar(id=52027,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R8=scalar(id=24520,umin=8,umax=0xfffffffffffffff9) R9_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=0xff0000,var_off=(0x0; 0xff0000)) R10=fp0 fp-16=mmmmmmmm fp-24=mmmmmmmm fp-32=mmmmmmmm fp-40=mmmmmmmm fp-48=map_value(map=unwind_tables,ks=8,vs=1400000) fp-56=6 fp-64=scalar(id=52027,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) fp-72=map_value(map=heap,ks=4,vs=1080) fp-80=map_value(map=heap,ks=4,vs=1080) fp-88=map_value(map=exec_mappings,ks=16,vs=32) fp-96=mmmmmmmm fp-104=ctx() fp-112=mmmmmmmm fp-120=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) fp-128=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) fp-136=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) ; if (table->rows[mid].pc <= pc) { 296: (2d) if r4 > r8 goto pc-83 214: R0_w=scalar() R1=11 R2_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R3_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R4_w=scalar(umin=9,umax=0xfffffffffffffffa) R5_w=scalar() R6_w=scalar(id=52209,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R7=scalar(id=52027,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R8=scalar(id=24520,umin=8,umax=0xfffffffffffffff9) R9_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=0xff0000,var_off=(0x0; 0xff0000)) R10=fp0 fp-16=mmmmmmmm fp-24=mmmmmmmm fp-32=mmmmmmmm fp-40=mmmmmmmm fp-48=map_value(map=unwind_tables,ks=8,vs=1400000) fp-56=6 fp-64=scalar(id=52027,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) fp-72=map_value(map=heap,ks=4,vs=1080) fp-80=map_value(map=heap,ks=4,vs=1080) fp-88=map_value(map=exec_mappings,ks=16,vs=32) fp-96=mmmmmmmm fp-104=ctx() fp-112=mmmmmmmm fp-120=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) fp-128=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) fp-136=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) ; LOG("========== left %llu right %llu (shard index %d)", left, right, 214: (18) r9 = 0xbadfadbadfadbad ; R9_w=0xbadfadbadfadbad ; for (int i = 0; i < MAX_BINARY_SEARCH_DEPTH; i++) { 216: (07) r1 += -1 ; R1_w=10 217: (bf) r2 = r1 ; R1_w=10 R2_w=10 218: (67) r2 <<= 32 BPF program is too large. 
Processed 1000001 insn processed 1000001 insns (limit 1000000) max_states_per_insn 29 total_states 18306 peak_states 2761 mark_read 106 -- END PROG LOAD LOG -- libbpf: prog 'dwarf_unwind': failed to load: -7 libbpf: failed to load object 'profiler_bpf' libbpf: failed to load BPF skeleton 'profiler_bpf': -7 thread 'main' panicked at src/profiler.rs:159:36: load skel: Error: Argument list too long (os error 7) note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace ``` Test Plan ========= Added a regression test. --- flake.nix | 2 ++ src/bpf/profiler.bpf.c | 26 ++++++++------------------ vm.nix | 26 ++++++++++++++++++++++++++ vmtest.toml | 10 ++++++++++ 4 files changed, 46 insertions(+), 18 deletions(-) diff --git a/flake.nix b/flake.nix index 5cff985..62c8764 100644 --- a/flake.nix +++ b/flake.nix @@ -58,6 +58,8 @@ (import ./vm.nix { inherit pkgs; }).kernel_6_0 (import ./vm.nix { inherit pkgs; }).kernel_6_2 (import ./vm.nix { inherit pkgs; }).kernel_6_6 + (import ./vm.nix { inherit pkgs; }).kernel_6_8_7 + (import ./vm.nix { inherit pkgs; }).kernel_6_9_rc5 ]; LIBCLANG_PATH = lib.makeLibraryPath [ llvmPackages_16.libclang ]; diff --git a/src/bpf/profiler.bpf.c b/src/bpf/profiler.bpf.c index b570f67..3d08111 100644 --- a/src/bpf/profiler.bpf.c +++ b/src/bpf/profiler.bpf.c @@ -71,29 +71,19 @@ struct { } rate_limits SEC(".maps"); -// On arm64 running 6.8.4-200.fc39.aarch64 the verifier fails with argument list too long. This -// did not use to happen before and it's probably due to a regression in the way the verifier -// accounts for the explored paths. I have tried many other things, such as two mid variables -// but that did not do it. The theory of why this works is that perhaps it's making the verifier -// reset the state it had about the program exploration so the branches its exploring per iteration -// do not grow as much. -#ifdef __TARGET_ARCH_arm64 -static u32 __attribute__((optnone)) verifier_workaround(u32 value) { - return value; -} -#endif -#ifdef __TARGET_ARCH_x86 -static __always_inline u32 verifier_workaround(u32 value) { - return value; -} -#endif - // Binary search the unwind table to find the row index containing the unwind // information for a given program counter (pc) relative to the object file. static __always_inline u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc, u64 left, u64 right) { u64 found = BINARY_SEARCH_DEFAULT; + + // On kernels ~6.8 and greater the verifier fails with argument list too long. This did not use to + // happen before and it's probably due to a regression in the way the verifier accounts for the explored + // paths. I have tried many other things, such as two mid variables but that did not do it. + // Perhaps unrolling the loop works is that the verifier doesn't have as many states to explore + // per iteration. + #pragma unroll for (int i = 0; i < MAX_BINARY_SEARCH_DEPTH; i++) { // TODO(javierhonduco): ensure that this condition is right as we use // unsigned values... @@ -102,7 +92,7 @@ static __always_inline u64 find_offset_for_pc(stack_unwind_table_t *table, u64 p return found; } - u32 mid = verifier_workaround((left + right) / 2); + u32 mid = (left + right) / 2; // Appease the verifier. 
     if (mid < 0 || mid >= MAX_UNWIND_TABLE_SIZE) {
       LOG("\t.should never happen, mid: %lu, max: %lu", mid,
diff --git a/vm.nix b/vm.nix
index 173f4b2..d847ed7 100644
--- a/vm.nix
+++ b/vm.nix
@@ -52,6 +52,32 @@
     '';
   };
 
+  kernel_6_8_7 = pkgs.stdenv.mkDerivation {
+    name = "download-kernel-6.8.7";
+    src = pkgs.fetchurl {
+      url = "https://github.com/javierhonduco/lightswitch-kernels/raw/c0af7a3/bzImage_v6.8.7";
+      hash = "sha256-fZwGajRi9+otzokRxoss99aH9PLRuyl2UfJ5Echehdo=";
+    };
+    dontUnpack = true;
+    installPhase = ''
+      mkdir -p $out
+      cp -r $src $out/bzImage
+    '';
+  };
+
+  kernel_6_9_rc5 = pkgs.stdenv.mkDerivation {
+    name = "download-kernel-6.9-rc5";
+    src = pkgs.fetchurl {
+      url = "https://github.com/javierhonduco/lightswitch-kernels/raw/c0af7a3/bzImage_v6.9-rc5";
+      hash = "sha256-EA+nJ1M0/6QFPVA+fYkvXDhBcsmTnALpGr+tCJZsVyw=";
+    };
+    dontUnpack = true;
+    installPhase = ''
+      mkdir -p $out
+      cp -r $src $out/bzImage
+    '';
+  };
+
   vmtest = pkgs.rustPlatform.buildRustPackage {
     name = "vmtest";
     src = pkgs.fetchFromGitHub {
diff --git a/vmtest.toml b/vmtest.toml
index 3a06893..3e9501b 100644
--- a/vmtest.toml
+++ b/vmtest.toml
@@ -16,4 +16,14 @@ command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
 
 [[target]]
 name = "Fedora 6.6"
 kernel = "/nix/store/77ixckavs2qidx1pglmlxsj6bfvjqijb-download-kernel-6.6/bzImage"
+command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
+
+[[target]]
+name = "Upstream 6.8.7"
+kernel = "/nix/store/3ip5l5bmrxpjdf11xbrqpbrh7pqjn9wa-download-kernel-6.8.7/bzImage"
+command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
+
+[[target]]
+name = "Upstream v6.9-rc5"
+kernel = "/nix/store/zjwmw9s2gr772mjf0y3pfqvknbs0mcdc-download-kernel-6.9-rc5/bzImage"
 command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
\ No newline at end of file

From 8cbe2f99e8c5488dce6f12d2a34d0a1860378440 Mon Sep 17 00:00:00 2001
From: Francisco Javier Honduvilla Coto
Date: Mon, 29 Apr 2024 13:40:29 +0100
Subject: [PATCH 3/6] Use the right register type in arm64

We were using the x86 one. `bpf_user_pt_regs_t` is defined in vmlinux as:

```
typedef struct pt_regs bpf_user_pt_regs_t;
```

in x86 and:

```
typedef struct user_pt_regs bpf_user_pt_regs_t;
```

in arm64.

Future work
===========

Ensure `retrieve_task_registers` is correct for arm64.

Test Plan
=========

Tested on my x86 and arm64 machines.
---
 src/bpf/profiler.bpf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/bpf/profiler.bpf.c b/src/bpf/profiler.bpf.c
index 3d08111..f36ccc7 100644
--- a/src/bpf/profiler.bpf.c
+++ b/src/bpf/profiler.bpf.c
@@ -219,7 +219,7 @@ static __always_inline bool retrieve_task_registers(u64 *ip, u64 *sp, u64 *bp) {
   }
 
   void *ptr = stack + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
-  struct pt_regs *regs = ((struct pt_regs *)ptr) - 1;
+  bpf_user_pt_regs_t *regs = ((bpf_user_pt_regs_t *)ptr) - 1;
 
   *ip = PT_REGS_IP_CORE(regs);
   *sp = PT_REGS_SP_CORE(regs);
@@ -560,7 +560,7 @@ int dwarf_unwind(struct bpf_perf_event_data *ctx) {
 }
 
 // Set up the initial unwinding state.
-static __always_inline bool set_initial_state(unwind_state_t *unwind_state, struct pt_regs *regs) { +static __always_inline bool set_initial_state(unwind_state_t *unwind_state, bpf_user_pt_regs_t *regs) { unwind_state->stack.len = 0; unwind_state->tail_calls = 0; From 2da179012329b57f3f206dcb1407cc9f18f9978c Mon Sep 17 00:00:00 2001 From: Francisco Javier Honduvilla Coto Date: Wed, 1 May 2024 15:28:29 +0100 Subject: [PATCH 4/6] tests: Fix off-by-32 bits in LPM tree The integration tests started failing starting with kernel 6.8, and after some checks, it pointed to this commit https://github.com/torvalds/linux/commit/9b75dbeb36fcd9fc7ed51d370310d0518a387769. The tests fail now because the prefix length we pass is 32 bits higher (see initialiser) than the maximum prefix length. After this commit was introduced, any prefix length that's higher than the key size will return NULL earlier, rather than returning the value with the longest match, even if it's beyond the size the key can encode. Test Plan ========= Ran the tests and some profiles on a box running 6.8.7-200 (Fedora) without issues. Also added an assertion to ensure that the prefix_len is not bigger than the LPM trie key size. --- src/bpf/profiler_bindings.rs | 12 ++++++++++-- src/profiler.rs | 2 +- src/util.rs | 21 +++++++++++++-------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/bpf/profiler_bindings.rs b/src/bpf/profiler_bindings.rs index 43e22a7..c9c7661 100644 --- a/src/bpf/profiler_bindings.rs +++ b/src/bpf/profiler_bindings.rs @@ -16,9 +16,17 @@ unsafe impl Plain for exec_mappings_key {} unsafe impl Plain for mapping_t {} impl exec_mappings_key { - pub fn new(pid: u32, address: u64, prefix: u32) -> Self { + pub fn new(pid: u32, address: u64, prefix_len: u32) -> Self { + let key_size_bits = std::mem::size_of::() * 8; + assert!( + prefix_len <= key_size_bits.try_into().unwrap(), + "prefix_len {} should be <= than the size of exec_mappings_key {}", + prefix_len, + key_size_bits + ); + Self { - prefix_len: 32 + prefix, + prefix_len, pid: pid.to_be(), data: address.to_be(), } diff --git a/src/profiler.rs b/src/profiler.rs index eb1dd51..76b1419 100644 --- a/src/profiler.rs +++ b/src/profiler.rs @@ -924,7 +924,7 @@ impl Profiler<'_> { let key = exec_mappings_key::new( pid.try_into().unwrap(), address_range.addr, - address_range.range, + 32 + address_range.prefix_len, ); self.add_bpf_mapping(&key, &mapping).unwrap(); diff --git a/src/util.rs b/src/util.rs index 4875c44..1d5f01b 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,7 +1,7 @@ #[derive(Debug, PartialEq)] pub struct AddressBlockRange { pub addr: u64, - pub range: u32, + pub prefix_len: u32, } /// For a given address range, calculate all the prefix ranges to ensure searching @@ -22,7 +22,7 @@ pub fn summarize_address_range(low: u64, high: u64) -> Vec { ); res.push(AddressBlockRange { addr: curr, - range: 64 - number_of_bits, + prefix_len: 64 - number_of_bits, }); curr += 1 << number_of_bits; if curr - 1 == u64::MAX { @@ -51,18 +51,21 @@ mod tests { assert_eq!( summarize_address_range(0, 100), vec![ - AddressBlockRange { addr: 0, range: 58 }, + AddressBlockRange { + addr: 0, + prefix_len: 58 + }, AddressBlockRange { addr: 64, - range: 59 + prefix_len: 59 }, AddressBlockRange { addr: 96, - range: 62 + prefix_len: 62 }, AddressBlockRange { addr: 100, - range: 64 + prefix_len: 64 } ] ); @@ -100,7 +103,8 @@ mod tests { assert!(mapping1.end < mapping2.begin); for address_range in summarize_address_range(mapping1.begin, mapping1.end - 1) { - let key = 
exec_mappings_key::new(510530, address_range.addr, address_range.range);
+            let key =
+                exec_mappings_key::new(510530, address_range.addr, 32 + address_range.prefix_len);
             map.update(
                 unsafe { plain::as_bytes(&key) },
                 unsafe { plain::as_bytes(&mapping1) },
@@ -110,7 +114,8 @@
         }
 
         for address_range in summarize_address_range(mapping2.begin, mapping2.end - 1) {
-            let key = exec_mappings_key::new(510530, address_range.addr, address_range.range);
+            let key =
+                exec_mappings_key::new(510530, address_range.addr, 32 + address_range.prefix_len);
             map.update(
                 unsafe { plain::as_bytes(&key) },
                 unsafe { plain::as_bytes(&mapping2) },

From a8eaa1b7ccf008ee60e3f55fbf035816185a2064 Mon Sep 17 00:00:00 2001
From: Francisco Javier Honduvilla Coto
Date: Wed, 1 May 2024 16:03:40 +0100
Subject: [PATCH 5/6] unwinder/native: Log vDSO segments

Unwinding through vDSO regions is not supported yet. We need to open the
non-file memory mapping and retrieve its unwind info, but until that is
implemented let's log with the right message.

The mapping types will probably disappear, which is why I am not
bothering to extract them into a constant or enum.
---
 src/bpf/profiler.bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bpf/profiler.bpf.c b/src/bpf/profiler.bpf.c
index f36ccc7..66422b3 100644
--- a/src/bpf/profiler.bpf.c
+++ b/src/bpf/profiler.bpf.c
@@ -340,7 +340,7 @@ int dwarf_unwind(struct bpf_perf_event_data *ctx) {
   }
 
   if (mapping->type == 2) {
-    LOG("JIT section, stopping");
+    LOG("vDSO section, stopping");
     return 1;
   }

From 07677fb2837f12e7b4d4ae46cb6e679cefeef3fb Mon Sep 17 00:00:00 2001
From: Francisco Javier Honduvilla Coto
Date: Mon, 29 Apr 2024 13:23:22 +0100
Subject: [PATCH 6/6] Unwind information performance improvements

This commit:
- sorts the FDEs rather than the whole unwind information, which is a
  smaller vector. This showed no performance changes in benchmarks;
- optimises the unwind info in place. Most of the performance gains in
  this commit come from this change.

Wrote a benchmark with criterion.rs and saw a ~25% decrease in time
spent building and optimising the unwind info.

```
     Running benches/unwind_info.rs (target/release/deps/unwind_info-b3269dc277690031)
WARNING: HTML report generation will become a non-default optional feature in Criterion.rs 0.4.0.
This feature is being moved to cargo-criterion (https://github.com/bheisler/cargo-criterion) and will be optional in a future version of Criterion.rs.
To silence this warning, either switch to cargo-criterion or enable the 'html_reports' feature in your Cargo.toml.
Gnuplot not found, using plotters backend
unwind info + sorting   time:   [37.697 ms 39.061 ms 40.592 ms]
                        change: [-26.710% -23.315% -19.870%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  3 (3.00%) high mild
  5 (5.00%) high severe
```

For this test, this node.js binary was used

```
/home/javierhonduco/.vscode-server/cli/servers/Stable-e170252f762678dec6ca2cc69aba1570769a5d39/server/node: ELF 64-bit LSB executable, x86-64, version 1 (GNU/Linux), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, BuildID[sha1]=e034c66a6dc897492d6ecae6c6b046669f94ad96, with debug_info, not stripped, too many notes (256)
```

I am not committing the benchmarks as I bumped into some rough edges
with criterion and I am not sure if it's maintained anymore. Might be
worth adding some benchmarks later on.
Test Plan ========= Compared the unwind info before and after this change for a bunch of executables. --- src/main.rs | 29 +++++-- src/profiler.rs | 12 +-- src/unwind_info.rs | 212 ++++++++++++++++++++++++--------------------- 3 files changed, 140 insertions(+), 113 deletions(-) diff --git a/src/main.rs b/src/main.rs index 1477292..4aec28c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,7 +20,10 @@ use tracing_subscriber::FmtSubscriber; use lightswitch::collector::Collector; use lightswitch::object::ObjectFile; use lightswitch::profiler::Profiler; -use lightswitch::unwind_info::{compact_printing_callback, UnwindInfoBuilder}; +use lightswitch::unwind_info::in_memory_unwind_info; +use lightswitch::unwind_info::remove_redundant; +use lightswitch::unwind_info::remove_unnecesary_markers; +use lightswitch::unwind_info::UnwindInfoBuilder; const SAMPLE_FREQ_RANGE: RangeInclusive = 1..=1009; @@ -125,6 +128,25 @@ fn main() -> Result<(), Box> { let args = Cli::parse(); + if let Some(path) = args.show_unwind_info { + let mut unwind_info = in_memory_unwind_info(&path).unwrap(); + remove_unnecesary_markers(&mut unwind_info); + remove_redundant(&mut unwind_info); + + for compact_row in unwind_info { + let pc = compact_row.pc; + let cfa_type = compact_row.cfa_type; + let rbp_type = compact_row.rbp_type; + let cfa_offset = compact_row.cfa_offset; + let rbp_offset = compact_row.rbp_offset; + println!( + "pc: {:x} cfa_type: {:<2} rbp_type: {:<2} cfa_offset: {:<4} rbp_offset: {:<4}", + pc, cfa_type, rbp_type, cfa_offset, rbp_offset + ); + } + return Ok(()); + } + let subscriber = FmtSubscriber::builder() .with_max_level(if args.filter_logs { Level::TRACE @@ -137,11 +159,6 @@ fn main() -> Result<(), Box> { tracing::subscriber::set_global_default(subscriber).expect("setting default subscriber failed"); - if let Some(path) = args.show_unwind_info { - UnwindInfoBuilder::with_callback(&path, compact_printing_callback)?.process()?; - return Ok(()); - } - if let Some(path) = args.show_info { let objet_file = ObjectFile::new(&PathBuf::from(path.clone())).unwrap(); println!("build id {:?}", objet_file.build_id()); diff --git a/src/profiler.rs b/src/profiler.rs index 76b1419..4d7112e 100644 --- a/src/profiler.rs +++ b/src/profiler.rs @@ -752,17 +752,9 @@ impl Profiler<'_> { } span.exit(); - let span: span::EnteredSpan = span!(Level::DEBUG, "sort unwind info").entered(); - found_unwind_info.sort_by(|a, b| { - let a_pc = a.pc; - let b_pc = b.pc; - a_pc.cmp(&b_pc) - }); - span.exit(); - let span: span::EnteredSpan = span!(Level::DEBUG, "optimize unwind info").entered(); - let found_unwind_info = remove_unnecesary_markers(&found_unwind_info); - let found_unwind_info = remove_redundant(&found_unwind_info); + remove_unnecesary_markers(&mut found_unwind_info); + remove_redundant(&mut found_unwind_info); span.exit(); debug!( diff --git a/src/unwind_info.rs b/src/unwind_info.rs index b5d4a63..9f02cee 100644 --- a/src/unwind_info.rs +++ b/src/unwind_info.rs @@ -9,7 +9,7 @@ use thiserror::Error; use crate::bpf::profiler_bindings::stack_unwind_row_t; use anyhow::anyhow; -use tracing::error; +use tracing::{error, span, Level}; #[repr(u8)] pub enum CfaType { @@ -252,8 +252,8 @@ impl<'a> UnwindInfoBuilder<'a> { let eh_frame = EhFrame::new(eh_frame_data, endian); let mut entries_iter = eh_frame.entries(&bases); - let mut ctx = Box::new(UnwindContext::new()); let mut cur_cie = None; + let mut pc_and_fde_offset = Vec::new(); while let Ok(Some(entry)) = entries_iter.next() { match entry { @@ -261,7 +261,7 @@ impl<'a> 
UnwindInfoBuilder<'a> { cur_cie = Some(cie); } CieOrFde::Fde(partial_fde) => { - if let Ok(fde) = partial_fde.parse(|eh_frame, bases, cie_offset| { + let fde = partial_fde.parse(|eh_frame, bases, cie_offset| { if let Some(cie) = &cur_cie { if cie.offset() == cie_offset.0 { return Ok(cie.clone()); @@ -272,111 +272,125 @@ impl<'a> UnwindInfoBuilder<'a> { cur_cie = Some(cie.clone()); } cie - }) { - (self.callback)(&UnwindData::Function( - fde.initial_address(), - fde.initial_address() + fde.len(), - )); - - let mut table = fde.rows(&eh_frame, &bases, &mut ctx)?; - - loop { - let mut compact_row = CompactUnwindRow::default(); - - match table.next_row() { - Ok(None) => break, - Ok(Some(row)) => { - compact_row.pc = row.start_address(); - match row.cfa() { - CfaRule::RegisterAndOffset { register, offset } => { - if register == &RBP_X86 { - compact_row.cfa_type = - CfaType::FramePointerOffset as u8; - } else if register == &RSP_X86 { - compact_row.cfa_type = - CfaType::StackPointerOffset as u8; - } else { - compact_row.cfa_type = - CfaType::UnsupportedRegisterOffset as u8; - } - - match u16::try_from(*offset) { - Ok(off) => { - compact_row.cfa_offset = off; - } - Err(_) => { - compact_row.cfa_type = - CfaType::OffsetDidNotFit as u8; - } - } - } - CfaRule::Expression(exp) => { - let found_expression = exp.0.slice(); - - if found_expression == *PLT1 { - compact_row.cfa_offset = PltType::Plt1 as u16; - } else if found_expression == *PLT2 { - compact_row.cfa_offset = PltType::Plt2 as u16; - } - - compact_row.cfa_type = CfaType::Expression as u8; - } - }; - - match row.register(RBP_X86) { - gimli::RegisterRule::Undefined => {} - gimli::RegisterRule::Offset(offset) => { - compact_row.rbp_type = RbpType::CfaOffset as u8; - compact_row.rbp_offset = - i16::try_from(offset).expect("convert rbp offset"); - } - gimli::RegisterRule::Register(_reg) => { - compact_row.rbp_type = RbpType::Register as u8; - } - gimli::RegisterRule::Expression(_) => { - compact_row.rbp_type = RbpType::Expression as u8; - } - _ => { - // print!(", rbp unsupported {:?}", rbp); - } - } + }); + + if let Ok(fde) = fde { + pc_and_fde_offset.push((fde.initial_address(), fde.offset())); + } + } + } + } - if row.register(fde.cie().return_address_register()) - == gimli::RegisterRule::Undefined - { - compact_row.rbp_type = - RbpType::UndefinedReturnAddress as u8; + let span = span!(Level::DEBUG, "sort pc and fdes").entered(); + pc_and_fde_offset.sort_by_key(|(pc, _)| *pc); + span.exit(); + + let mut ctx = Box::new(UnwindContext::new()); + for (_, fde_offset) in pc_and_fde_offset { + let fde = eh_frame.fde_from_offset( + &bases, + gimli::EhFrameOffset(fde_offset), + EhFrame::cie_from_offset, + )?; + + (self.callback)(&UnwindData::Function( + fde.initial_address(), + fde.initial_address() + fde.len(), + )); + + let mut table = fde.rows(&eh_frame, &bases, &mut ctx)?; + + loop { + let mut compact_row = CompactUnwindRow::default(); + + match table.next_row() { + Ok(None) => break, + Ok(Some(row)) => { + compact_row.pc = row.start_address(); + match row.cfa() { + CfaRule::RegisterAndOffset { register, offset } => { + if register == &RBP_X86 { + compact_row.cfa_type = CfaType::FramePointerOffset as u8; + } else if register == &RSP_X86 { + compact_row.cfa_type = CfaType::StackPointerOffset as u8; + } else { + compact_row.cfa_type = CfaType::UnsupportedRegisterOffset as u8; + } + + match u16::try_from(*offset) { + Ok(off) => { + compact_row.cfa_offset = off; + } + Err(_) => { + compact_row.cfa_type = CfaType::OffsetDidNotFit as u8; } + } + } + 
CfaRule::Expression(exp) => { + let found_expression = exp.0.slice(); - // print!(", ra {:?}", row.register(fde.cie().return_address_register())); + if found_expression == *PLT1 { + compact_row.cfa_offset = PltType::Plt1 as u16; + } else if found_expression == *PLT2 { + compact_row.cfa_offset = PltType::Plt2 as u16; } - _ => continue, + + compact_row.cfa_type = CfaType::Expression as u8; } + }; - (self.callback)(&UnwindData::Instruction(compact_row)); + match row.register(RBP_X86) { + gimli::RegisterRule::Undefined => {} + gimli::RegisterRule::Offset(offset) => { + compact_row.rbp_type = RbpType::CfaOffset as u8; + compact_row.rbp_offset = + i16::try_from(offset).expect("convert rbp offset"); + } + gimli::RegisterRule::Register(_reg) => { + compact_row.rbp_type = RbpType::Register as u8; + } + gimli::RegisterRule::Expression(_) => { + compact_row.rbp_type = RbpType::Expression as u8; + } + _ => { + // print!(", rbp unsupported {:?}", rbp); + } } - // start_addresses.push(fde.initial_address() as u32); - // end_addresses.push((fde.initial_address() + fde.len()) as u32); + + if row.register(fde.cie().return_address_register()) + == gimli::RegisterRule::Undefined + { + compact_row.rbp_type = RbpType::UndefinedReturnAddress as u8; + } + + // print!(", ra {:?}", row.register(fde.cie().return_address_register())); } + _ => continue, } + + (self.callback)(&UnwindData::Instruction(compact_row)); } + // start_addresses.push(fde.initial_address() as u32); + // end_addresses.push((fde.initial_address() + fde.len()) as u32); } Ok(()) } } -// Must be sorted. Also, not very optimized as of now. -pub fn remove_unnecesary_markers(info: &Vec) -> Vec { - let mut unwind_info = Vec::with_capacity(info.len()); +// Must be sorted. +pub fn remove_unnecesary_markers(unwind_info: &mut Vec) { let mut last_row: Option = None; + let mut new_i: usize = 0; + + for i in 0..unwind_info.len() { + let row = unwind_info[i]; - for row in info { if let Some(last_row_unwrapped) = last_row { let previous_is_redundant_marker = (last_row_unwrapped.cfa_type == CfaType::EndFdeMarker as u8) && last_row_unwrapped.pc == row.pc; if previous_is_redundant_marker { - unwind_info.pop(); + new_i -= 1; } } @@ -387,22 +401,25 @@ pub fn remove_unnecesary_markers(info: &Vec) -> Vec) -> Vec { - let mut unwind_info = Vec::with_capacity(info.len()); +// Must be sorted. +pub fn remove_redundant(unwind_info: &mut Vec) { let mut last_row: Option = None; + let mut new_i: usize = 0; - for row in info { + for i in 0..unwind_info.len() { let mut redundant = false; + let row = unwind_info[i]; + if let Some(last_row_unwrapped) = last_row { redundant = row.cfa_type == last_row_unwrapped.cfa_type && row.cfa_offset == last_row_unwrapped.cfa_offset @@ -411,17 +428,18 @@ pub fn remove_redundant(info: &Vec) -> Vec anyhow::Result> { - let mut unwind_info = Vec::new(); + let mut unwind_info: Vec = Vec::new(); let mut last_function_end_addr: Option = None; let mut last_row = None;