diff --git a/pkg/ebpf/verifier/elf.go b/pkg/ebpf/verifier/elf.go new file mode 100644 index 00000000000000..31ef4ce75855e1 --- /dev/null +++ b/pkg/ebpf/verifier/elf.go @@ -0,0 +1,312 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +//go:build linux_bpf + +// This file uses the DWARF and cilium/ebpf libraries to build a source map for an eBPF object file. +// This map links each instruction in the program to the source line in the original C code. This task +// is more complex than it looks due to the complexity of the DWARF format and the way eBPF programs +// are compiled and stored in the ELF file. +// +// In the DWARF format, the source line information is stored in the .debug_line section. The contents +// of that section (or the result of interpreting that section, because the binary format is just a list of +// instructions that generate the data, not the data itself; luckily for us that's done by the DWARF library) +// is a list of line entries, each of them with a file, line number, and address. The address represents the offset +// of the instruction in the program. +// +// Then, we have the .debug_info section which contains all the information about symbols, types, and others. We +// are interested in the Subprogram objects (functions), which have a name and a lowpc attribute. The lowpc attribute +// is the address of the first instruction of the function in the .debug_line section. +// +// Now, the main problem is that those addresses are not unique, but are just offsets relative to the start of the sequence, which is a +// concept I couldn't find a definition for. But from reading the standard, it seems that a sequence is an ELF section with +// executable code.
So, in order to have a translation from the .debug_line addresses to eBPF program + instruction index, we +// need to do the following: +// 1. Build a map of symbols (functions) to the sequence they belong to (implemented in buildSymbolToSequenceMap based on ELF data) +// 2. Read the .debug_info section to get the start address of each program, and use the previous map to append the sequence index. With that we +// build a map (sequence, address) -> program name. +// 3. Read the .debug_line section, tracking the sequence index. Each sequence is a monotonically +// increasing sequence of addresses, so we track restarts of the sequence to increase the sequence index. +// For each line, we check if that tuple (sequence, address) points to a program start. If so, then every subsequent line +// belongs to that program, until we find a new program start. The corresponding assembly instruction index is the offset of the line +// relative to the start of the program. With this information we build a map of (program name, instruction index) -> source line. +// 4. Now we just iterate through all the instructions in each eBPF program using cilium/ebpf and assign the corresponding source line +// to each instruction. We use this extra library as, in some cases, it provides the source line information for the instruction from +// BTF data, which can be helpful to validate the correctness of the DWARF data.
// getLineReader returns the reader for the DWARF line table of dwarfData. It
// walks the DWARF entries and uses the first compile unit that actually carries
// a line table.
//
// dwarf.Data.LineReader reports a compile unit without line information as a
// nil reader together with a nil error. Such units are skipped, so on success
// the returned reader is never nil. An error is returned when the entries
// cannot be iterated, when a line table cannot be decoded, or when no compile
// unit provides one.
func getLineReader(dwarfData *dwarf.Data) (*dwarf.LineReader, error) {
	entryReader := dwarfData.Reader()
	if entryReader == nil {
		return nil, errors.New("cannot get dwarf reader")
	}

	for {
		entry, err := entryReader.Next()
		if err != nil {
			return nil, fmt.Errorf("failed to iterate DWARF entries: %w", err)
		}
		if entry == nil {
			break
		}
		if entry.Tag != dwarf.TagCompileUnit {
			continue
		}

		lineReader, err := dwarfData.LineReader(entry)
		if err != nil {
			return nil, fmt.Errorf("cannot instantiate line reader: %w", err)
		}
		// A nil reader with a nil error means "this unit has no line table":
		// keep looking instead of handing a nil reader to the caller.
		if lineReader != nil {
			return lineReader, nil
		}
	}
	return nil, errors.New("no line reader found in DWARF data")
}

// progStartPoint defines a possible start point for a program: the index of
// the line-table sequence it belongs to plus its address inside that sequence.
type progStartPoint struct {
	sequenceIndex int
	addr          int64
}

// buildProgStartMap builds a map of DWARF line information points that are the start of a program.
// This is used to know which program a line belongs to, as the DWARF line info data doesn't have that information.
//
// Every DW_TAG_subprogram entry that has a non-empty string name and a low PC
// address contributes one key: the sequence index stored in symToSeq for that
// name plus the low PC. Entries lacking either attribute, or holding them with
// an unexpected value type, are ignored rather than causing a panic. An error
// is returned when the entries cannot be iterated or when a named program has
// no entry in symToSeq.
func buildProgStartMap(dwarfData *dwarf.Data, symToSeq map[string]int) (map[progStartPoint]string, error) {
	entryReader := dwarfData.Reader()
	if entryReader == nil {
		return nil, errors.New("cannot get dwarf reader")
	}

	progStartLines := make(map[progStartPoint]string)
	for {
		entry, err := entryReader.Next()
		if err != nil {
			return nil, fmt.Errorf("failed to iterate DWARF entries: %w", err)
		}
		if entry == nil {
			break
		}
		if entry.Tag != dwarf.TagSubprogram {
			continue
		}

		// Checked assertions: a missing attribute yields a nil value and an
		// attribute encoded with another form yields another Go type; both
		// simply make the entry unusable here.
		progName, hasName := entry.Val(dwarf.AttrName).(string)
		lowPC, hasLowPC := entry.Val(dwarf.AttrLowpc).(uint64)
		progStartAddr := int64(lowPC)
		// -1 used to be the "address not found" marker, so an all-ones low PC
		// keeps being treated as absent.
		if !hasName || progName == "" || !hasLowPC || progStartAddr == -1 {
			continue // Ignore if we don't have all the fields
		}

		seqIndex, ok := symToSeq[progName]
		if !ok {
			return nil, fmt.Errorf("cannot find sequence for symbol %s", progName)
		}
		progStartLines[progStartPoint{sequenceIndex: seqIndex, addr: progStartAddr}] = progName
	}

	return progStartLines, nil
}

// buildSymbolToSequenceMap builds a map that links each symbol to the sequence index it belongs to.
// The address in the DWARF debug_line section is relative to the start of each sequence, but the symbol information
// doesn't explicitly say which sequence it belongs to. This function builds that map.
+func buildSymbolToSequenceMap(elfFile *elf.File) (map[string]int, error) { + symbols, err := elfFile.Symbols() + if err != nil { + return nil, fmt.Errorf("failed to read symbols from ELF file: %w", err) + } + + // Each sequence is a section, unless that section has no content + sectIndexToSeqIndex := make(map[int]int) + idx := 0 + for i, sect := range elfFile.Sections { + if sect.Flags&elf.SHF_EXECINSTR != 0 && sect.Size > 0 { + sectIndexToSeqIndex[i] = idx + idx++ + } + } + + symToSeq := make(map[string]int) + for _, sym := range symbols { + sectIndex := int(sym.Section) + if sectIndex >= 0 && sectIndex < len(elfFile.Sections) { + symToSeq[sym.Name] = sectIndexToSeqIndex[int(sectIndex)] + } + } + + return symToSeq, nil +} + +// openSafeELFFile opens an ELF file and recovers from panics that might happen when reading it. +func openSafeELFFile(path string) (safe *elf.File, err error) { + defer func() { + r := recover() + if r == nil { + return + } + + safe = nil + err = fmt.Errorf("reading ELF file panicked: %s", r) + }() + + file, err := elf.Open(path) + if err != nil { + return nil, err + } + + return file, nil +} + +// getSourceMap builds the source map for an eBPF program. It returns two maps, one that +// for each program function maps the instruction offset to the source line information, and +// another that for each section maps the functions that belong to it. +func getSourceMap(file string, spec *ebpf.CollectionSpec) (map[string]map[int]*SourceLine, map[string][]string, error) { + // Open the ELF file + elfFile, err := openSafeELFFile(file) + if err != nil { + return nil, nil, fmt.Errorf("cannot open ELF file %s: %w", file, err) + } + defer elfFile.Close() + + // Read the debug information for line data. The Go DWARF reader fails when reading eBPF + // files because of missing support for relocations. However, we don't need them here as we're + // not necessary for line info, so we can skip them. 
The DWARF library will skip that processing + // if we set manually the type of the file to ET_EXEC. + elfFile.Type = elf.ET_EXEC + dwarfData, err := elfFile.DWARF() + if err != nil { + return nil, nil, fmt.Errorf("cannot read DWARF data for %s: %w", file, err) + } + entryReader := dwarfData.Reader() + if entryReader == nil { + return nil, nil, fmt.Errorf("cannot get DWARF reader for %s: %w", file, err) + } + + // Get the reader for the .debug_lines section + lineReader, err := getLineReader(dwarfData) + if err != nil { + return nil, nil, fmt.Errorf("cannot get line reader for %s: %w", file, err) + } + + // Build the map that links each symbol to the sequence index it belongs to + symToSeq, err := buildSymbolToSequenceMap(elfFile) + if err != nil { + return nil, nil, fmt.Errorf("cannot build symbol to section index map for %s: %w", file, err) + } + + // Build the map that links the sequence index + address to the start of each program, as the DWARF + // line info data doesn't tell you which program a line belongs to. + progStartMap, err := buildProgStartMap(dwarfData, symToSeq) + if err != nil { + return nil, nil, fmt.Errorf("cannot build program start lines for %s: %w", file, err) + } + + // Now build the map that, for each program, links the instruction offset to line information + // Read all the lines in the .debug_lines section + offsets := make(map[string]map[uint64]string) + currProgram := "" + startingOffset := uint64(0) + sequenceIndex := 0 + prevAddress := 0 + for { + var line dwarf.LineEntry + err := lineReader.Next(&line) + if err == io.EOF { + break + } + if err != nil { + return nil, nil, fmt.Errorf("DWARF lineReader file %s: %w", file, err) + } + + // Ignore lines with no data + if line.File == nil || line.Line == 0 { + continue + } + + // Increase section indexes whenever we reset the program. We have to look at the value of the previous + // address, because we might have multiple lines that have the same zero address. 
+ if line.Address == 0 && prevAddress != 0 { + sequenceIndex++ + } + prevAddress = int(line.Address) + + startPoint := progStartPoint{sequenceIndex, int64(line.Address)} + lineinfo := fmt.Sprintf("%s:%d", line.File.Name, line.Line) + + // Reset the current program only if it's the first time we see it. Multiple + // assembly instructions might point to the first source line of a program + if newProg, ok := progStartMap[startPoint]; ok && newProg != currProgram { + // We need to keep track of the starting offset for each program to calculate + // the offset relative to the start. The eBPF loaders count program instructions + // from the start of the program, while in the ELF binary they're relative to the + // section start, and we might have multiple functions per section. + startingOffset = line.Address + currProgram = progStartMap[startPoint] + } + + if currProgram == "" { + // We might have information of programs that are not in the spec, ignore those + continue + } + if _, ok := offsets[currProgram]; !ok { + offsets[currProgram] = make(map[uint64]string) + } + offset := line.Address - startingOffset + offsets[currProgram][offset] = lineinfo + } + + // Now that we have line information for each instruction, we can build the source map + sourceMap := make(map[string]map[int]*SourceLine) + funcsPerSection := make(map[string][]string) + currLineInfo := "" + currLine := "" + for _, progSpec := range spec.Programs { + sourceMap[progSpec.Name] = make(map[int]*SourceLine) + funcsPerSection[progSpec.SectionName] = append(funcsPerSection[progSpec.SectionName], progSpec.Name) + + iter := progSpec.Instructions.Iterate() + for iter.Next() { + ins := iter.Ins + insOffset := iter.Offset.Bytes() + insIdx := int(iter.Offset) // Use the instruction offset as the index, because that's what the verifier uses. 
This accounts for double-wide instructions + if _, ok := offsets[progSpec.Name][insOffset]; ok { + // A single C line can generate multiple instructions, only update the value + // if we have a new source line + currLineInfo = offsets[progSpec.Name][insOffset] + currLine = "" + } else if insIdx == 0 { + return nil, nil, fmt.Errorf("missing line information at initial instruction for program %s", progSpec.Name) + } + // Keep the last source line for the instruction if we don't have a new one + if ins.Source() != nil && ins.Source().String() != "" { + currLine = ins.Source().String() + } + + sline := SourceLine{LineInfo: currLineInfo, Line: currLine} + sourceMap[progSpec.Name][insIdx] = &sline + } + } + + return sourceMap, funcsPerSection, nil +} diff --git a/pkg/ebpf/verifier/elf_test.go b/pkg/ebpf/verifier/elf_test.go new file mode 100644 index 00000000000000..4132fd3bf62a6e --- /dev/null +++ b/pkg/ebpf/verifier/elf_test.go @@ -0,0 +1,120 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
+ +//go:build linux_bpf + +package verifier + +import ( + "io/fs" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "testing" + + "github.com/cilium/ebpf" + "golang.org/x/exp/maps" + + ddebpf "github.com/DataDog/datadog-agent/pkg/ebpf" + "github.com/DataDog/datadog-agent/pkg/util/filesystem" + + "github.com/stretchr/testify/require" +) + +func TestGetSourceMap(t *testing.T) { + objectFiles := make(map[string]string) + directory := ddebpf.NewConfig().BPFDir + err := filepath.WalkDir(directory, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + + if strings.Contains(path, "-debug") || !strings.HasSuffix(path, ".o") { + return nil + } + + if _, ok := objectFiles[d.Name()]; !ok { + objectFiles[d.Name()] = path + } + return nil + }) + require.NoError(t, err) + require.Greater(t, len(objectFiles), 0, "no object files found") + + fileCache := make(map[string][]string) + + for name, path := range objectFiles { + t.Run(name, func(tt *testing.T) { + spec, err := ebpf.LoadCollectionSpec(path) + require.NoError(tt, err) + sourceMap, funcsPerSection, err := getSourceMap(path, spec) + + require.NoError(tt, err) + require.NotEmpty(tt, sourceMap) + require.NotEmpty(tt, funcsPerSection) + + for _, funcs := range funcsPerSection { + require.NotEmpty(tt, funcs) + for _, f := range funcs { + require.Contains(tt, sourceMap, f) + } + } + + for prog, progSourceMap := range sourceMap { + require.NotEmpty(tt, progSourceMap) + hasSourceInfo := false + + // Iterate all the instructions and compare the two sources of data we have. + // On one hand we have file-line from DWARF, on the other we have the line contents + // from BTF data. We compare the two and make sure they match for most of the lines + // We accept some divergence as sometimes there will be differences with macros, etc. 
+ insList := maps.Keys(progSourceMap) + sort.Ints(insList) + for _, ins := range insList { + sl := progSourceMap[ins] + if sl.LineInfo == "" { + continue + } + hasSourceInfo = true + + if sl.Line == "" { // We cannot compare with btf-defined source lines + continue + } + + // Do not try to read files in KMT environment, as we lack the source files + if _, ok := os.LookupEnv("GITLAB_CI"); ok { + continue + } + + // Compare the line info with the one from the actual file + infoParts := strings.Split(sl.LineInfo, ":") + require.Len(tt, infoParts, 2) + line, err := strconv.Atoi(infoParts[1]) + require.NoError(tt, err) + sourceFile := infoParts[0] + + if _, ok := fileCache[sourceFile]; !ok { + // Read all the lines from the file + lines, err := filesystem.ReadLines(sourceFile) + require.NoError(tt, err, "cannot read file %s", sourceFile) + fileCache[sourceFile] = lines + } + + require.GreaterOrEqual(tt, line, 0, "invalid line %d, ins %d", line, ins) + require.LessOrEqual(tt, line, len(fileCache[sourceFile]), "line %d not found in %s, ins %d", line, sourceFile, ins) + expectedLine := fileCache[sourceFile][line-1] + require.Equal(tt, expectedLine, sl.Line, "mismatch at instruction %d, lineinfo=%s, prog %s", ins, sl.LineInfo, prog) + } + + require.True(tt, hasSourceInfo, "no source info found for %s", prog) + } + }) + } +} diff --git a/pkg/ebpf/verifier/stats.go b/pkg/ebpf/verifier/stats.go index d75f46056ae9f1..f3d8089552760c 100644 --- a/pkg/ebpf/verifier/stats.go +++ b/pkg/ebpf/verifier/stats.go @@ -13,7 +13,6 @@ import ( "fmt" "log" "os" - "os/exec" "path/filepath" "reflect" "regexp" @@ -83,112 +82,6 @@ func BuildVerifierStats(opts *StatsOptions) (*StatsResult, map[string]struct{}, return results, failedToLoad, nil } -func getSourceMap(file string) (map[string]map[int]*SourceLine, map[string][]string, error) { - // call llvm-objdump to get the source map in the shell - // We cannot use the go DWARF library because it doesn't support certain features - // (replications) 
for eBPF programs. - cmd := exec.Command("llvm-objdump", "-Sl", file) - out, err := cmd.Output() - if err != nil { - return nil, nil, fmt.Errorf("failed to run llvm-objdump on %s: %w", file, err) - } - - sourceMap := make(map[string]map[int]*SourceLine) - funcsPerSection := make(map[string][]string) - lines := strings.Split(string(out), "\n") - nextLineInfo := "" - currLineInfo, currLine := "", "" - currSect, currFunc := "", "" - - sectionRegex := regexp.MustCompile("Disassembly of section (.*):") - functionRegex := regexp.MustCompile("^[0-9a-fA-F]{16} <([a-zA-Z_][a-zA-Z0-9_]+)>:$") - lineInfoRegex := regexp.MustCompile("^; [^:]+:[0-9]+") - functionJustStarted := false - insnOffset := 0 - - // Very ad-hoc parsing but enough for our purposes - for _, line := range lines { - if len(line) == 0 { - continue - } - - // With -l, llvm-objdump will print the source line info - // in two lines starting with ;. The first is the file and line number, - // the second is the source line itself. - // So we keep track of the last two things we found that started with ";" - // Sometimes we can get a function entry point for an assembly line without source information, - // so we need to discard that. We only save the source information if the first line is of the form - // "; :" and the second line is the actual source line. - // Note that a single code line might translate to multiple assembly instructions, so we do - // this once and keep the state for all assembly lines following. 
- if line[0] == ';' { - if lineInfoRegex.MatchString(line) { - nextLineInfo = line - } else if nextLineInfo != "" { - currLineInfo = strings.TrimPrefix(nextLineInfo, "; ") - currLine = strings.TrimPrefix(line, "; ") - nextLineInfo = "" - } - continue - } - nextLineInfo = "" // Reset the next line info if we don't have a source line - - // Check for section headers - sectionMatch := sectionRegex.FindStringSubmatch(line) - if len(sectionMatch) >= 2 { - currSect = strings.ReplaceAll(sectionMatch[1], "/", "__") // match naming convention - log.Printf("Found section %s\n", currSect) - continue - } - - // Check for function names - functionMatch := functionRegex.FindStringSubmatch(line) - if len(functionMatch) >= 2 && !strings.HasPrefix(functionMatch[1], "LBB") { // Ignore block labels - currFunc = functionMatch[1] - log.Printf("Found function %s\n", currFunc) - - if currSect == "" { - log.Printf("WARN: Found function %s without section, line=%v\n", currFunc, line) - } else { - funcsPerSection[currSect] = append(funcsPerSection[currSect], currFunc) - } - - if _, ok := sourceMap[currFunc]; !ok { - sourceMap[currFunc] = make(map[int]*SourceLine) - } - functionJustStarted = true // Mark that this function just started so we have the instruction offset of the start - continue - } - - // We should have a section and function at this point, ignore the line if we don't - if currSect == "" || currFunc == "" { - continue - } - - line = strings.TrimLeft(line, " \t") - parts := strings.Split(line, ":") - insn, err := strconv.Atoi(parts[0]) - if err != nil { - continue - } - - if functionJustStarted { - // llvm-objdump counts instructions since the start of the section, but the verifier - // counts from the start of the function. 
We need to account for that offset - insnOffset = insn - functionJustStarted = false - } - insn -= insnOffset - - sourceMap[currFunc][insn] = &SourceLine{ - LineInfo: currLineInfo, - Line: currLine, - } - } - - return sourceMap, funcsPerSection, nil -} - func generateLoadFunction(file string, opts *StatsOptions, results *StatsResult, failedToLoad map[string]struct{}) func(bytecode.AssetReader, manager.Options) error { return func(bc bytecode.AssetReader, managerOptions manager.Options) error { kversion, err := kernel.HostVersion() @@ -251,7 +144,7 @@ func generateLoadFunction(file string, opts *StatsOptions, results *StatsResult, ) if opts.DetailedComplexity { - sourceMap, funcsPerSect, err = getSourceMap(file) + sourceMap, funcsPerSect, err = getSourceMap(file, collectionSpec) if err != nil { return fmt.Errorf("failed to get llvm-objdump data for %v: %w", file, err) }