Skip to content

Commit

Permalink
pw_tokenizer: Add Detokenizer constructor with elf binary section
Browse files Browse the repository at this point in the history
Add an additional detokenizer constructor which takes a Pigweed
token-entries ELF section as input and reads the entries into the database.

Change-Id: I24d1dee1e05bcecdc5d50e84c081be3f65f93689
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/190650
Reviewed-by: Wyatt Hepler <[email protected]>
Presubmit-Verified: CQ Bot Account <[email protected]>
Commit-Queue: Yixuan Wang <[email protected]>
  • Loading branch information
yixuanwang authored and CQ Bot Account committed Feb 26, 2024
1 parent 1ea2d05 commit 614d94a
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 2 deletions.
2 changes: 2 additions & 0 deletions pw_tokenizer/Android.bp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ cc_library_static {
"pw_log_null_headers",
"pw_polyfill_headers",
"pw_preprocessor_headers",
"pw_result_headers",
"pw_span_headers",
],
export_header_lib_headers: [
Expand All @@ -39,6 +40,7 @@ cc_library_static {
"pw_log_null_headers",
"pw_polyfill_headers",
"pw_preprocessor_headers",
"pw_result_headers",
"pw_span_headers",
],
srcs: [
Expand Down
19 changes: 19 additions & 0 deletions pw_tokenizer/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ load("@rules_python//python:proto.bzl", "py_proto_library")
load(
"//pw_build:pigweed.bzl",
"pw_cc_binary",
"pw_cc_blob_info",
"pw_cc_blob_library",
"pw_cc_test",
"pw_linker_script",
)
Expand Down Expand Up @@ -108,6 +110,7 @@ cc_library(
deps = [
":base64",
"//pw_bytes",
"//pw_result",
"//pw_span",
"//pw_varint",
],
Expand Down Expand Up @@ -193,13 +196,29 @@ pw_cc_test(
],
)

# Embeds an example ELF binary containing tokenized strings as the C++ array
# symbol kElfSection, for use by detokenize_test.
pw_cc_blob_info(
name = "detokenizer_example_elf_blob",
file_path = "//pw_tokenizer/py:example_binary_with_tokenized_strings.elf",
symbol_name = "kElfSection",
)

# Generates the header exposing the blob above under namespace test::ns.
pw_cc_blob_library(
name = "detokenizer_elf_test_blob",
blobs = [
":detokenizer_example_elf_blob",
],
namespace = "test::ns",
out_header = "pw_tokenizer/example_binary_with_tokenized_strings.h",
)

# Unit test for the detokenizer; depends on the embedded example ELF blob so
# it can exercise construction from an ELF section.
pw_cc_test(
name = "detokenize_test",
srcs = [
"detokenize_test.cc",
],
deps = [
":decoder",
":detokenizer_elf_test_blob",
"//pw_unit_test",
],
)
Expand Down
19 changes: 18 additions & 1 deletion pw_tokenizer/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import("//build_overrides/pigweed.gni")

import("$dir_pw_arduino_build/arduino.gni")
import("$dir_pw_bloat/bloat.gni")
import("$dir_pw_build/cc_blob_library.gni")
import("$dir_pw_build/module_config.gni")
import("$dir_pw_build/target_types.gni")
import("$dir_pw_docgen/docs.gni")
Expand Down Expand Up @@ -117,6 +118,7 @@ pw_source_set("decoder") {
public_configs = [ ":public_include_path" ]
public_deps = [
dir_pw_preprocessor,
dir_pw_result,
dir_pw_span,
]
deps = [
Expand Down Expand Up @@ -226,7 +228,10 @@ pw_test("decode_test") {

pw_test("detokenize_test") {
sources = [ "detokenize_test.cc" ]
deps = [ ":decoder" ]
deps = [
":decoder",
":detokenizer_elf_test_blob",
]

# TODO(tonymd): This fails on Teensyduino 1.54 beta core. It may be related to
# linking in stl functions. Will debug when 1.54 is released.
Expand Down Expand Up @@ -366,3 +371,15 @@ pw_size_diff("tokenizer_size_report") {
},
]
}

# Embeds the example ELF with tokenized strings as test::ns::kElfSection so
# detokenize_test can build a detokenizer from a raw ELF section.
pw_cc_blob_library("detokenizer_elf_test_blob") {
out_header = "pw_tokenizer/example_binary_with_tokenized_strings.h"
namespace = "test::ns"
blobs = [
{
file_path = "py/example_binary_with_tokenized_strings.elf"
symbol_name = "kElfSection"
},
]
# Test-only data; restrict to targets within this BUILD file.
visibility = [ ":*" ]
}
13 changes: 13 additions & 0 deletions pw_tokenizer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

include($ENV{PW_ROOT}/pw_build/pigweed.cmake)
include($ENV{PW_ROOT}/pw_protobuf_compiler/proto.cmake)
include($ENV{PW_ROOT}/pw_build/cc_blob_library.cmake)

pw_add_module_config(pw_tokenizer_CONFIG)

Expand All @@ -36,6 +37,7 @@ pw_add_library(pw_tokenizer STATIC
public
PUBLIC_DEPS
pw_containers
pw_result
pw_span
pw_preprocessor
pw_tokenizer.config
Expand Down Expand Up @@ -166,6 +168,7 @@ pw_add_test(pw_tokenizer.detokenize_test
detokenize_test.cc
PRIVATE_DEPS
pw_tokenizer.decoder
pw_build.detokenizer_elf_test_blob
GROUPS
modules
pw_tokenizer
Expand Down Expand Up @@ -213,6 +216,16 @@ pw_add_test(pw_tokenizer.tokenize_test
pw_tokenizer
)

# Embeds the example ELF with tokenized strings as test::ns::kElfSection for
# pw_tokenizer.detokenize_test.
# NOTE(review): the target is named pw_build.* but is defined here in
# pw_tokenizer — consider renaming to pw_tokenizer.detokenizer_elf_test_blob
# (the detokenize_test PRIVATE_DEPS entry must be updated to match).
pw_cc_blob_library(pw_build.detokenizer_elf_test_blob
HEADER
pw_tokenizer/example_binary_with_tokenized_strings.h
NAMESPACE
test::ns
BLOB
SYMBOL_NAME kElfSection
PATH py/example_binary_with_tokenized_strings.elf
)

if(Zephyr_FOUND)
zephyr_link_libraries_ifdef(CONFIG_PIGWEED_TOKENIZER pw_tokenizer)
zephyr_link_libraries_ifdef(CONFIG_PIGWEED_TOKENIZER_BASE64 pw_tokenizer.base64)
Expand Down
31 changes: 31 additions & 0 deletions pw_tokenizer/detokenize.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "pw_bytes/bit.h"
#include "pw_bytes/endian.h"
#include "pw_result/result.h"
#include "pw_tokenizer/base64.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/nested_tokenization.h"
Expand Down Expand Up @@ -171,6 +172,36 @@ Detokenizer::Detokenizer(const TokenDatabase& database) {
}
}

// Builds a detokenizer from the raw bytes of an ELF binary's
// .pw_tokenizer.entries section. Each entry is a _pw_tokenizer_EntryHeader
// followed by a domain string and the tokenized string. Returns
// Status::DataLoss() if the data is not a valid sequence of entries.
Result<Detokenizer> Detokenizer::FromElfSection(
    span<const uint8_t> elf_section) {
  size_t index = 0;
  std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;

  // Use <= so an entry whose header (with empty payload) ends exactly at the
  // section boundary is not dropped.
  while (index + sizeof(_pw_tokenizer_EntryHeader) <= elf_section.size()) {
    _pw_tokenizer_EntryHeader header;
    std::memcpy(
        &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
    index += sizeof(_pw_tokenizer_EntryHeader);

    if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
      return Status::DataLoss();
    }

    // The domain string precedes the tokenized string; skip over it.
    index += header.domain_length;

    // A string that would run past the end of the section means the input is
    // truncated or corrupt. Fail fast rather than silently skipping it, which
    // would leave index desynchronized for the next header read.
    if (index + header.string_length > elf_section.size()) {
      return Status::DataLoss();
    }

    // TODO(b/326365218): Construct FormatString with string_view to avoid
    // creating a copy here.
    std::string entry(reinterpret_cast<const char*>(elf_section.data() + index),
                      header.string_length);
    index += header.string_length;
    database[header.token].emplace_back(entry.c_str(),
                                        TokenDatabase::kDateRemovedNever);
  }
  return Detokenizer(std::move(database));
}

DetokenizedString Detokenizer::Detokenize(
const span<const uint8_t>& encoded) const {
// The token is missing from the encoded data; there is nothing to do.
Expand Down
19 changes: 19 additions & 0 deletions pw_tokenizer/detokenize_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <string_view>

#include "pw_tokenizer/example_binary_with_tokenized_strings.h"
#include "pw_unit_test/framework.h"

namespace pw::tokenizer {
Expand Down Expand Up @@ -66,6 +67,24 @@ TEST_F(Detokenize, NoFormatting) {
EXPECT_EQ(detok_.Detokenize("\xff\xee\xee\xdd"sv).BestString(), "FOUR");
}

TEST_F(Detokenize, FromElfSection) {
  // Build a detokenizer from an ELF file that contains only the pw_tokenizer
  // sections (see py/detokenize_test.py). The offset and size below locate
  // the .pw_tokenizer.entries section within the binary, in bytes.
  constexpr uint32_t kSectionOffset = 0x00000174;
  constexpr size_t kSectionSize = 0x000004C2;

  const auto* section_start = reinterpret_cast<const uint8_t*>(
      test::ns::kElfSection.data() + kSectionOffset);
  pw::span<const uint8_t> token_entries(section_start, kSectionSize);

  pw::Result<Detokenizer> detokenizer =
      Detokenizer::FromElfSection(token_entries);
  ASSERT_TRUE(detokenizer.ok());
  EXPECT_EQ(detokenizer->Detokenize("\xd6\x8c\x66\x2e").BestString(),
            "Jello, world!");
}

TEST_F(Detokenize, BestString_MissingToken_IsEmpty) {
EXPECT_FALSE(detok_.Detokenize("").ok());
EXPECT_TRUE(detok_.Detokenize("", 0u).BestString().empty());
Expand Down
11 changes: 11 additions & 0 deletions pw_tokenizer/public/pw_tokenizer/detokenize.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <utility>
#include <vector>

#include "pw_result/result.h"
#include "pw_span/span.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/token_database.h"
Expand Down Expand Up @@ -80,6 +81,16 @@ class Detokenizer {
// referenced by the Detokenizer after construction; its memory can be freed.
Detokenizer(const TokenDatabase& database);

// Constructs a detokenizer by directly passing the parsed database. The map
// is moved into the instance, so the caller's argument is consumed.
explicit Detokenizer(
std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>&&
database)
: database_(std::move(database)) {}

// Factory method which returns a detokenizer instance from the
// .pw_tokenizer.entries section of an ELF binary. Returns Status::DataLoss()
// if the section does not contain valid token entries.
static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section);

// Decodes and detokenizes the encoded message. Returns a DetokenizedString
// that stores all possible detokenized string results.
DetokenizedString Detokenize(const span<const uint8_t>& encoded) const;
Expand Down
6 changes: 5 additions & 1 deletion pw_tokenizer/public/pw_tokenizer/token_database.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,17 @@ class TokenDatabase {
}

public:
/// Default date_removed for an entry in the token database if it was never
/// removed.
static constexpr uint32_t kDateRemovedNever = 0xFFFFFFFF;

/// An entry in the token database.
struct Entry {
/// The token that represents this string.
uint32_t token;

/// The date the token and string were removed from the database, or
/// `kDateRemovedNever` if it was never removed. Dates are encoded such that
/// natural integer sorting sorts from oldest to newest dates. The day is
/// stored as an 8-bit day, 8-bit month, and 16-bit year, packed into a
/// little-endian `uint32_t`.
Expand Down
1 change: 1 addition & 0 deletions pw_tokenizer/public/pw_tokenizer/tokenize.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#endif // __cplusplus

#include "pw_polyfill/static_assert.h"
#include "pw_preprocessor/arguments.h"
#include "pw_preprocessor/compiler.h"
#include "pw_preprocessor/concat.h"
Expand Down

0 comments on commit 614d94a

Please sign in to comment.