Skip to content

Commit

Permalink
pw_tokenizer: Add Detokenizer constructor with elf binary section
Browse files Browse the repository at this point in the history
Add an additional detokenizer constructor which takes a Pigweed
token-entries ELF section as input and reads the entries into the database.

Change-Id: I24d1dee1e05bcecdc5d50e84c081be3f65f93689
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/190650
Reviewed-by: Wyatt Hepler <[email protected]>
Presubmit-Verified: CQ Bot Account <[email protected]>
Commit-Queue: Yixuan Wang <[email protected]>
  • Loading branch information
yixuanwang authored and CQ Bot Account committed Feb 26, 2024
1 parent 1ea2d05 commit 614d94a
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 2 deletions.
2 changes: 2 additions & 0 deletions pw_tokenizer/Android.bp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ cc_library_static {
"pw_log_null_headers",
"pw_polyfill_headers",
"pw_preprocessor_headers",
"pw_result_headers",
"pw_span_headers",
],
export_header_lib_headers: [
Expand All @@ -39,6 +40,7 @@ cc_library_static {
"pw_log_null_headers",
"pw_polyfill_headers",
"pw_preprocessor_headers",
"pw_result_headers",
"pw_span_headers",
],
srcs: [
Expand Down
19 changes: 19 additions & 0 deletions pw_tokenizer/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ load("@rules_python//python:proto.bzl", "py_proto_library")
load(
"//pw_build:pigweed.bzl",
"pw_cc_binary",
"pw_cc_blob_info",
"pw_cc_blob_library",
"pw_cc_test",
"pw_linker_script",
)
Expand Down Expand Up @@ -108,6 +110,7 @@ cc_library(
deps = [
":base64",
"//pw_bytes",
"//pw_result",
"//pw_span",
"//pw_varint",
],
Expand Down Expand Up @@ -193,13 +196,29 @@ pw_cc_test(
],
)

# Embeds an example ELF binary containing tokenized strings as the C++ array
# symbol kElfSection, for use by detokenize_test.
pw_cc_blob_info(
name = "detokenizer_example_elf_blob",
file_path = "//pw_tokenizer/py:example_binary_with_tokenized_strings.elf",
symbol_name = "kElfSection",
)

# Generates the header exposing the blob above under namespace test::ns.
pw_cc_blob_library(
name = "detokenizer_elf_test_blob",
blobs = [
":detokenizer_example_elf_blob",
],
namespace = "test::ns",
out_header = "pw_tokenizer/example_binary_with_tokenized_strings.h",
)

# Unit test for the detokenizer; depends on the embedded example ELF blob so
# it can exercise construction from an ELF section.
pw_cc_test(
name = "detokenize_test",
srcs = [
"detokenize_test.cc",
],
deps = [
":decoder",
":detokenizer_elf_test_blob",
"//pw_unit_test",
],
)
Expand Down
19 changes: 18 additions & 1 deletion pw_tokenizer/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import("//build_overrides/pigweed.gni")

import("$dir_pw_arduino_build/arduino.gni")
import("$dir_pw_bloat/bloat.gni")
import("$dir_pw_build/cc_blob_library.gni")
import("$dir_pw_build/module_config.gni")
import("$dir_pw_build/target_types.gni")
import("$dir_pw_docgen/docs.gni")
Expand Down Expand Up @@ -117,6 +118,7 @@ pw_source_set("decoder") {
public_configs = [ ":public_include_path" ]
public_deps = [
dir_pw_preprocessor,
dir_pw_result,
dir_pw_span,
]
deps = [
Expand Down Expand Up @@ -226,7 +228,10 @@ pw_test("decode_test") {

pw_test("detokenize_test") {
sources = [ "detokenize_test.cc" ]
deps = [ ":decoder" ]
deps = [
":decoder",
":detokenizer_elf_test_blob",
]

# TODO(tonymd): This fails on Teensyduino 1.54 beta core. It may be related to
# linking in stl functions. Will debug when 1.54 is released.
Expand Down Expand Up @@ -366,3 +371,15 @@ pw_size_diff("tokenizer_size_report") {
},
]
}

# Embeds the example ELF with tokenized strings as test::ns::kElfSection so
# detokenize_test can build a detokenizer from a raw ELF section.
pw_cc_blob_library("detokenizer_elf_test_blob") {
out_header = "pw_tokenizer/example_binary_with_tokenized_strings.h"
namespace = "test::ns"
blobs = [
{
file_path = "py/example_binary_with_tokenized_strings.elf"
symbol_name = "kElfSection"
},
]
# Test-only data; restrict to targets within this BUILD file.
visibility = [ ":*" ]
}
13 changes: 13 additions & 0 deletions pw_tokenizer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

include($ENV{PW_ROOT}/pw_build/pigweed.cmake)
include($ENV{PW_ROOT}/pw_protobuf_compiler/proto.cmake)
include($ENV{PW_ROOT}/pw_build/cc_blob_library.cmake)

pw_add_module_config(pw_tokenizer_CONFIG)

Expand All @@ -36,6 +37,7 @@ pw_add_library(pw_tokenizer STATIC
public
PUBLIC_DEPS
pw_containers
pw_result
pw_span
pw_preprocessor
pw_tokenizer.config
Expand Down Expand Up @@ -166,6 +168,7 @@ pw_add_test(pw_tokenizer.detokenize_test
detokenize_test.cc
PRIVATE_DEPS
pw_tokenizer.decoder
pw_build.detokenizer_elf_test_blob
GROUPS
modules
pw_tokenizer
Expand Down Expand Up @@ -213,6 +216,16 @@ pw_add_test(pw_tokenizer.tokenize_test
pw_tokenizer
)

# Embeds the example ELF with tokenized strings as test::ns::kElfSection for
# pw_tokenizer.detokenize_test.
# NOTE(review): the target is named pw_build.* but is defined here in
# pw_tokenizer — consider renaming to pw_tokenizer.detokenizer_elf_test_blob
# (the detokenize_test PRIVATE_DEPS entry must be updated to match).
pw_cc_blob_library(pw_build.detokenizer_elf_test_blob
HEADER
pw_tokenizer/example_binary_with_tokenized_strings.h
NAMESPACE
test::ns
BLOB
SYMBOL_NAME kElfSection
PATH py/example_binary_with_tokenized_strings.elf
)

if(Zephyr_FOUND)
zephyr_link_libraries_ifdef(CONFIG_PIGWEED_TOKENIZER pw_tokenizer)
zephyr_link_libraries_ifdef(CONFIG_PIGWEED_TOKENIZER_BASE64 pw_tokenizer.base64)
Expand Down
31 changes: 31 additions & 0 deletions pw_tokenizer/detokenize.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
#include "pw_result/result.h"
#include "pw_tokenizer/base64.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/nested_tokenization.h"
Expand Down Expand Up @@ -171,6 +172,36 @@ Detokenizer::Detokenizer(const TokenDatabase& database) {
}
}

// Builds a detokenizer from the raw bytes of an ELF binary's
// .pw_tokenizer.entries section. Each entry is a _pw_tokenizer_EntryHeader
// followed by a domain string and the tokenized string. Returns
// Status::DataLoss() if the data is not a valid sequence of entries.
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const uint8_t> elf_section) {
  size_t index = 0;
  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;

  // Use <= so an entry whose header (with empty payload) ends exactly at the
  // section boundary is not dropped.
  while (index + sizeof(_pw_tokenizer_EntryHeader) <= elf_section.size()) {
    _pw_tokenizer_EntryHeader header;
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

    // The domain string precedes the tokenized string; skip over it.
    index += header.domain_length;

    // A string that would run past the end of the section means the input is
    // truncated or corrupt. Fail fast rather than silently skipping it, which
    // would leave index desynchronized for the next header read.
    if (index + header.string_length > elf_section.size()) {
      return Status::DataLoss();
    }

    // TODO(b/326365218): Construct FormatString with string_view to avoid
    // creating a copy here.
    std::string entry(reinterpret_cast<const char*>(elf_section.data() + index),
                      header.string_length);
    index += header.string_length;
    database[header.token].emplace_back(entry.c_str(),
                                        TokenDatabase::kDateRemovedNever);
  }
  return Detokenizer(std::move(database));
}

DetokenizedString Detokenizer::Detokenize(
const span<const uint8_t>& encoded) const {
// The token is missing from the encoded data; there is nothing to do.
Expand Down
19 changes: 19 additions & 0 deletions pw_tokenizer/detokenize_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <string_view>

#include "pw_tokenizer/example_binary_with_tokenized_strings.h"
#include "pw_unit_test/framework.h"

namespace pw::tokenizer {
Expand Down Expand Up @@ -66,6 +67,24 @@ TEST_F(Detokenize, NoFormatting) {
EXPECT_EQ(detok_.Detokenize("\xff\xee\xee\xdd"sv).BestString(), "FOUR");
}

TEST_F(Detokenize, FromElfSection) {
  // Build a detokenizer from an ELF file that contains only the pw_tokenizer
  // sections (see py/detokenize_test.py). The offset and size below locate
  // the .pw_tokenizer.entries section within the binary, in bytes.
  constexpr uint32_t kSectionOffset = 0x00000174;
  constexpr size_t kSectionSize = 0x000004C2;

  const auto* section_start = reinterpret_cast<const uint8_t*>(
      test::ns::kElfSection.data() + kSectionOffset);
  pw::span<const uint8_t> token_entries(section_start, kSectionSize);

  pw::Result<Detokenizer> detokenizer =
      Detokenizer::FromElfSection(token_entries);
  ASSERT_TRUE(detokenizer.ok());
  EXPECT_EQ(detokenizer->Detokenize("\xd6\x8c\x66\x2e").BestString(),
            "Jello, world!");
}

TEST_F(Detokenize, BestString_MissingToken_IsEmpty) {
EXPECT_FALSE(detok_.Detokenize("").ok());
EXPECT_TRUE(detok_.Detokenize("", 0u).BestString().empty());
Expand Down
11 changes: 11 additions & 0 deletions pw_tokenizer/public/pw_tokenizer/detokenize.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <utility>
#include <vector>

#include "pw_result/result.h"
#include "pw_span/span.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/token_database.h"
Expand Down Expand Up @@ -80,6 +81,16 @@ class Detokenizer {
// referenced by the Detokenizer after construction; its memory can be freed.
Detokenizer(const TokenDatabase& database);

// Constructs a detokenizer by directly passing the parsed database. The map
// is moved into the instance, so the caller's argument is consumed.
explicit Detokenizer(
std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>&&
database)
: database_(std::move(database)) {}

// Factory method which returns a detokenizer instance from the
// .pw_tokenizer.entries section of an ELF binary. Returns Status::DataLoss()
// if the section does not contain valid token entries.
static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section);

// Decodes and detokenizes the encoded message. Returns a DetokenizedString
// that stores all possible detokenized string results.
DetokenizedString Detokenize(const span<const uint8_t>& encoded) const;
Expand Down
6 changes: 5 additions & 1 deletion pw_tokenizer/public/pw_tokenizer/token_database.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,17 @@ class TokenDatabase {
}

public:
/// Default date_removed for an entry in the token database if it was never
/// removed.
static constexpr uint32_t kDateRemovedNever = 0xFFFFFFFF;

/// An entry in the token database.
struct Entry {
/// The token that represents this string.
uint32_t token;

/// The date the token and string were removed from the database, or
/// `kDateRemovedNever` if it was never removed. Dates are encoded such that
/// natural integer sorting sorts from oldest to newest dates. The day is
/// stored as an 8-bit day, 8-bit month, and 16-bit year, packed into a
/// little-endian `uint32_t`.
Expand Down
1 change: 1 addition & 0 deletions pw_tokenizer/public/pw_tokenizer/tokenize.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#endif // __cplusplus

#include "pw_polyfill/static_assert.h"
#include "pw_preprocessor/arguments.h"
#include "pw_preprocessor/compiler.h"
#include "pw_preprocessor/concat.h"
Expand Down

0 comments on commit 614d94a

Please sign in to comment.