From c2a5309efa83f03f2dde3bce9307993ba095ddfc Mon Sep 17 00:00:00 2001
From: Francisco Javier Honduvilla Coto <javierhonduco@gmail.com>
Date: Mon, 29 Apr 2024 11:42:20 +0100
Subject: [PATCH] Remediation for verifier regression in kernels ~6.8 and
 greater

Some days ago we put a fix for arm64 kernels as a verifier regression
was introduced causing loading errors
[commit](https://github.com/javierhonduco/lightswitch/commit/a8094d18d0e8a662d3581380934cd185eb30bf45).

Turns out this is not just affecting arm64, as it just happened it's
that my arm64 VM was running a more up to date kernel. This regression
affects x86 as well.

This commit changes the remediation to unrolloing the binary search loop
and adds a kernel regression test. I haven't measured the performance
implication of doing this vs the previous remediation. This is out of
the scope of this PR and it's something I will work on later on.

When I have spare cycles I should report this upstream.

The error
=========

```
292: (bf) r6 = r3                     ; R3_w=scalar(id=52209,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R6_w=scalar(id=52209,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff))
293: (bf) r3 = r2                     ; R2_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R3_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff))
294: (2d) if r4 > r8 goto pc+1 296: R0_w=scalar() R1=11 R2_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R3_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R4_w=scalar(umin=9,umax=0xfffffffffffffffa) R5_w=scalar() R6_w=scalar(id=52209,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R7=scalar(id=52027,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R8=scalar(id=24520,umin=8,umax=0xfffffffffffffff9) R9_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=0xff0000,var_off=(0x0; 0xff0000)) R10=fp0 fp-16=mmmmmmmm fp-24=mmmmmmmm fp-32=mmmmmmmm fp-40=mmmmmmmm fp-48=map_value(map=unwind_tables,ks=8,vs=1400000) fp-56=6 fp-64=scalar(id=52027,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) fp-72=map_value(map=heap,ks=4,vs=1080) fp-80=map_value(map=heap,ks=4,vs=1080) fp-88=map_value(map=exec_mappings,ks=16,vs=32) fp-96=mmmmmmmm fp-104=ctx() fp-112=mmmmmmmm fp-120=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) fp-128=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) fp-136=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff))
; if (table->rows[mid].pc <= pc) {
296: (2d) if r4 > r8 goto pc-83 214: R0_w=scalar() R1=11 R2_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R3_w=scalar(id=52208,smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=0x1869d,var_off=(0x0; 0x1ffff)) R4_w=scalar(umin=9,umax=0xfffffffffffffffa) R5_w=scalar() R6_w=scalar(id=52209,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R7=scalar(id=52027,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) R8=scalar(id=24520,umin=8,umax=0xfffffffffffffff9) R9_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=0xff0000,var_off=(0x0; 0xff0000)) R10=fp0 fp-16=mmmmmmmm fp-24=mmmmmmmm fp-32=mmmmmmmm fp-40=mmmmmmmm fp-48=map_value(map=unwind_tables,ks=8,vs=1400000) fp-56=6 fp-64=scalar(id=52027,smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x1869e,var_off=(0x0; 0x1ffff)) fp-72=map_value(map=heap,ks=4,vs=1080) fp-80=map_value(map=heap,ks=4,vs=1080) fp-88=map_value(map=exec_mappings,ks=16,vs=32) fp-96=mmmmmmmm fp-104=ctx() fp-112=mmmmmmmm fp-120=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) fp-128=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) fp-136=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff))
; LOG("========== left %llu right %llu (shard index %d)", left, right,
214: (18) r9 = 0xbadfadbadfadbad      ; R9_w=0xbadfadbadfadbad
; for (int i = 0; i < MAX_BINARY_SEARCH_DEPTH; i++) {
216: (07) r1 += -1                    ; R1_w=10
217: (bf) r2 = r1                     ; R1_w=10 R2_w=10
218: (67) r2 <<= 32
BPF program is too large. Processed 1000001 insn
processed 1000001 insns (limit 1000000) max_states_per_insn 29 total_states 18306 peak_states 2761 mark_read 106
-- END PROG LOAD LOG --
libbpf: prog 'dwarf_unwind': failed to load: -7
libbpf: failed to load object 'profiler_bpf'
libbpf: failed to load BPF skeleton 'profiler_bpf': -7
thread 'main' panicked at src/profiler.rs:159:36:
load skel: Error: Argument list too long (os error 7)
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
```

Test Plan
=========

Added a regression test.
---
 flake.nix              |  2 ++
 src/bpf/profiler.bpf.c | 26 ++++++++------------------
 vm.nix                 | 26 ++++++++++++++++++++++++++
 vmtest.toml            | 10 ++++++++++
 4 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/flake.nix b/flake.nix
index 5cff985..62c8764 100644
--- a/flake.nix
+++ b/flake.nix
@@ -58,6 +58,8 @@
               (import ./vm.nix { inherit pkgs; }).kernel_6_0
               (import ./vm.nix { inherit pkgs; }).kernel_6_2
               (import ./vm.nix { inherit pkgs; }).kernel_6_6
+              (import ./vm.nix { inherit pkgs; }).kernel_6_8_7
+              (import ./vm.nix { inherit pkgs; }).kernel_6_9_rc5
             ];
 
             LIBCLANG_PATH = lib.makeLibraryPath [ llvmPackages_16.libclang ];
diff --git a/src/bpf/profiler.bpf.c b/src/bpf/profiler.bpf.c
index b570f67..3d08111 100644
--- a/src/bpf/profiler.bpf.c
+++ b/src/bpf/profiler.bpf.c
@@ -71,29 +71,19 @@ struct {
 } rate_limits SEC(".maps");
 
 
-// On arm64 running 6.8.4-200.fc39.aarch64 the verifier fails with argument list too long. This
-// did not use to happen before and it's probably due to a regression in the way the verifier
-// accounts for the explored paths. I have tried many other things, such as two mid variables
-// but that did not do it. The theory of why this works is that perhaps it's making the verifier
-// reset the state it had about the program exploration so the branches its exploring per iteration
-// do not grow as much.
-#ifdef __TARGET_ARCH_arm64
-static u32 __attribute__((optnone)) verifier_workaround(u32 value) {
-    return value;
-}
-#endif
-#ifdef __TARGET_ARCH_x86
-static __always_inline u32 verifier_workaround(u32 value) {
-    return value;
-}
-#endif
-
 // Binary search the unwind table to find the row index containing the unwind
 // information for a given program counter (pc) relative to the object file.
 static __always_inline u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc, u64 left,
                               u64 right) {
   u64 found = BINARY_SEARCH_DEFAULT;
 
+
+  // On kernels ~6.8 and greater the verifier fails with argument list too long. This did not use to
+  // happen before and it's probably due to a regression in the way the verifier accounts for the explored
+  // paths. I have tried many other things, such as two mid variables but that did not do it.
+  // Perhaps unrolling the loop works is that the verifier doesn't have as many states to explore
+  // per iteration.
+  #pragma unroll
   for (int i = 0; i < MAX_BINARY_SEARCH_DEPTH; i++) {
     // TODO(javierhonduco): ensure that this condition is right as we use
     // unsigned values...
@@ -102,7 +92,7 @@ static __always_inline u64 find_offset_for_pc(stack_unwind_table_t *table, u64 p
       return found;
     }
 
-    u32 mid = verifier_workaround((left + right) / 2);
+    u32 mid = (left + right) / 2;
     // Appease the verifier.
     if (mid < 0 || mid >= MAX_UNWIND_TABLE_SIZE) {
       LOG("\t.should never happen, mid: %lu, max: %lu", mid,
diff --git a/vm.nix b/vm.nix
index 173f4b2..d847ed7 100644
--- a/vm.nix
+++ b/vm.nix
@@ -52,6 +52,32 @@
     '';
   };
 
+  kernel_6_8_7 = pkgs.stdenv.mkDerivation {
+    name = "download-kernel-6.8.7";
+    src = pkgs.fetchurl {
+      url = "https://github.com/javierhonduco/lightswitch-kernels/raw/c0af7a3/bzImage_v6.8.7";
+      hash = "sha256-fZwGajRi9+otzokRxoss99aH9PLRuyl2UfJ5Echehdo=";
+    };
+    dontUnpack = true;
+    installPhase = ''
+      mkdir -p $out
+      cp -r $src $out/bzImage
+    '';
+  };
+
+  kernel_6_9_rc5 = pkgs.stdenv.mkDerivation {
+    name = "download-kernel-6.9-rc5";
+    src = pkgs.fetchurl {
+      url = "https://github.com/javierhonduco/lightswitch-kernels/raw/c0af7a3/bzImage_v6.9-rc5";
+      hash = "sha256-EA+nJ1M0/6QFPVA+fYkvXDhBcsmTnALpGr+tCJZsVyw=";
+    };
+    dontUnpack = true;
+    installPhase = ''
+      mkdir -p $out
+      cp -r $src $out/bzImage
+    '';
+  };
+
   vmtest = pkgs.rustPlatform.buildRustPackage {
     name = "vmtest";
     src = pkgs.fetchFromGitHub {
diff --git a/vmtest.toml b/vmtest.toml
index 3a06893..3e9501b 100644
--- a/vmtest.toml
+++ b/vmtest.toml
@@ -16,4 +16,14 @@ command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
 [[target]]
 name = "Fedora 6.6"
 kernel = "/nix/store/77ixckavs2qidx1pglmlxsj6bfvjqijb-download-kernel-6.6/bzImage"
+command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
+
+[[target]]
+name = "Upstream 6.8.7"
+kernel = "/nix/store/3ip5l5bmrxpjdf11xbrqpbrh7pqjn9wa-download-kernel-6.8.7/bzImage"
+command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
+
+[[target]]
+name = "Upstream v6.9-rc5"
+kernel = "/nix/store/zjwmw9s2gr772mjf0y3pfqvknbs0mcdc-download-kernel-6.9-rc5/bzImage"
 command = "/mnt/vmtest/target/debug/lightswitch --duration 0"
\ No newline at end of file