Skip to content

Commit

Permalink
Validate encoding in string_reader, and not in format strings
Browse files Browse the repository at this point in the history
  • Loading branch information
eliaskosunen committed Nov 8, 2023
1 parent 5e6f60f commit 877efb9
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 12 deletions.
6 changes: 6 additions & 0 deletions include/scn/detail/format_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,11 @@ namespace scn {
constexpr void on_literal_text(const CharT* begin,
const CharT* end) const
{
// TODO: Do we want to validate Unicode in format strings?
#if 1
SCN_UNUSED(begin);
SCN_UNUSED(end);
#else
while (begin != end) {
const auto len =
utf_code_point_length_by_starting_code_unit(*begin);
Expand All @@ -418,6 +423,7 @@ namespace scn {
begin += len;
}
#endif
}

constexpr auto on_arg_id()
Expand Down
70 changes: 61 additions & 9 deletions src/scn/impl/algorithms/read.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,19 +204,71 @@ namespace scn {
simple_borrowed_iterator_t<Range> read_until_code_point(Range&& range,
Predicate pred)
{
auto it = ranges::begin(range);
if constexpr (ranges::contiguous_range<Range> &&
ranges::sized_range<Range>) {
std::array<char32_t, 16> cp_buf{};
std::array<uint8_t, 16> idx_buf{};
auto it = ranges::begin(range);
while (it != ranges::end(range)) {
auto chunk_begin = it;
size_t code_point_count = 0;
uint8_t code_unit_idx = 0;
while (code_point_count < cp_buf.size() &&
it != ranges::end(range)) {
if (code_point_length_by_starting_code_unit(*it) != 0) {
idx_buf[code_point_count] = code_unit_idx;
++code_point_count;
}
++it;
++code_unit_idx;
}

while (it != ranges::end(range)) {
const auto [iter, value] = read_code_point_into(
ranges::subrange{it, ranges::end(range)});
const auto cp = decode_code_point_exhaustive(value.view());
if (pred(cp)) {
break;
auto input = detail::make_string_view_from_pointers(
detail::to_address(chunk_begin),
detail::to_address(it));
auto codepoints = span{cp_buf.data(), code_point_count};
auto transcode_result =
transcode_possibly_invalid(input, codepoints);
if (SCN_UNLIKELY(!transcode_result)) {
it = chunk_begin;
auto end = it + code_unit_idx;
while (it != end) {
const auto [iter, value] = read_code_point_into(
ranges::subrange{it, ranges::end(range)});
const auto cp =
decode_code_point_exhaustive(value.view());
if (pred(cp)) {
return it;
}
it = iter;
}
continue;
}

for (size_t i = 0; i < code_point_count; ++i) {
if (pred(cp_buf[i])) {
return chunk_begin + idx_buf[i];
}
}
}
it = iter;

return it;
}
else {
auto it = ranges::begin(range);

return it;
while (it != ranges::end(range)) {
const auto [iter, value] = read_code_point_into(
ranges::subrange{it, ranges::end(range)});
const auto cp = decode_code_point_exhaustive(value.view());
if (pred(cp)) {
break;
}
it = iter;
}

return it;
}
}

template <typename Range, typename Predicate>
Expand Down
13 changes: 12 additions & 1 deletion src/scn/impl/reader/string_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ namespace scn {
std::basic_string<DestCharT>& dst)
{
dst.clear();
transcode_to_string(src, dst);
transcode_valid_to_string(src, dst);
return {};
}

Expand Down Expand Up @@ -100,6 +100,11 @@ namespace scn {

auto src = make_contiguous_buffer(
ranges::subrange{ranges::begin(range), result});
if (!validate_unicode(src.view())) {
return unexpected_scan_error(
scan_error::invalid_scanned_value,
"Invalid encoding in scanned string");
}
if (auto e = transcode_if_necessary(SCN_MOVE(src), value);
SCN_UNLIKELY(!e)) {
return unexpected(e);
Expand Down Expand Up @@ -148,6 +153,12 @@ namespace scn {
value = std::basic_string_view<ValueCharT>(
ranges::data(view), ranges_polyfill::usize(view));

if (!validate_unicode(value)) {
return unexpected_scan_error(
scan_error::invalid_scanned_value,
"Invalid encoding in scanned string_view");
}

return SCN_MOVE(result);
}
}
Expand Down
68 changes: 68 additions & 0 deletions src/scn/impl/unicode/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,73 @@ namespace scn {
return input.begin() + offset;
}

// Transcode `input` into `output`, reporting failure instead of assuming
// that `input` is validly encoded.
//
// The source and destination encodings are selected at compile time with
// `get_encoding<SourceCharT>()` and `get_encoding<DestCharT>()`.
//
//  - Empty input: nothing is written and 0 is returned.
//  - Same encoding: the code units are copied verbatim with `memcpy`.
//    No validation is performed on this path, so it never returns
//    `std::nullopt`.
//  - Different encodings: the matching simdutf `convert_*_with_errors`
//    routine is used.
//
// `output` must be large enough to hold the transcoded result: the simdutf
// routines are only handed a pointer, not a size, so the bound can only be
// checked after the fact.
//
// Returns the number of code units written to `output`, or `std::nullopt`
// if simdutf reported an error for `input`.
template <typename SourceCharT, typename DestCharT>
std::optional<std::size_t> transcode_possibly_invalid(
    std::basic_string_view<SourceCharT> input,
    span<DestCharT> output)
{
    if (input.empty()) {
        return 0;
    }

    constexpr auto src_enc = get_encoding<SourceCharT>();
    constexpr auto dest_enc = get_encoding<DestCharT>();

    if constexpr (src_enc == dest_enc) {
        // The byte count below is computed from SourceCharT but the returned
        // length is in DestCharT units, so both must have the same width.
        static_assert(sizeof(SourceCharT) == sizeof(DestCharT),
                      "Identical encodings must have identically sized "
                      "code units");
        SCN_EXPECT(output.size() >= input.size());
        std::memcpy(output.data(), input.data(),
                    input.size() * sizeof(SourceCharT));
        return input.size();
    }
    else {
        // Kept inside `else` so that none of the conversions below are
        // instantiated (with mismatched reinterpret_casts) when the
        // encodings are identical.
        const auto result = [&]() {
            if constexpr (src_enc == encoding::utf8) {
                if constexpr (dest_enc == encoding::utf16) {
                    return simdutf::convert_utf8_to_utf16_with_errors(
                        input.data(), input.size(),
                        reinterpret_cast<char16_t*>(output.data()));
                }
                else {
                    return simdutf::convert_utf8_to_utf32_with_errors(
                        input.data(), input.size(),
                        reinterpret_cast<char32_t*>(output.data()));
                }
            }
            else if constexpr (src_enc == encoding::utf16) {
                if constexpr (dest_enc == encoding::utf8) {
                    return simdutf::convert_utf16_to_utf8_with_errors(
                        reinterpret_cast<const char16_t*>(input.data()),
                        input.size(), output.data());
                }
                else {
                    return simdutf::convert_utf16_to_utf32_with_errors(
                        reinterpret_cast<const char16_t*>(input.data()),
                        input.size(),
                        reinterpret_cast<char32_t*>(output.data()));
                }
            }
            else if constexpr (src_enc == encoding::utf32) {
                if constexpr (dest_enc == encoding::utf8) {
                    return simdutf::convert_utf32_to_utf8_with_errors(
                        reinterpret_cast<const char32_t*>(input.data()),
                        input.size(), output.data());
                }
                else {
                    return simdutf::convert_utf32_to_utf16_with_errors(
                        reinterpret_cast<const char32_t*>(input.data()),
                        input.size(),
                        reinterpret_cast<char16_t*>(output.data()));
                }
            }
        }();

        if (SCN_UNLIKELY(result.error != simdutf::SUCCESS)) {
            return std::nullopt;
        }
        // Post-condition only: by this point the data has already been
        // written, so this cannot prevent an overrun of `output`.
        SCN_ENSURE(result.count <= output.size());
        return result.count;
    }
}

template <typename SourceCharT, typename DestCharT>
std::size_t transcode_valid(std::basic_string_view<SourceCharT> input,
span<DestCharT> output)
Expand Down Expand Up @@ -632,6 +699,7 @@ namespace scn {
template <typename CharT, typename Cb>
void for_each_code_point(std::basic_string_view<CharT> input, Cb&& cb)
{
// TODO: Could be optimized by being eager
auto it = input.begin();
while (it != input.end()) {
auto res = get_next_code_point(
Expand Down
8 changes: 7 additions & 1 deletion tests/unittests/format_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ TEST(FormatStringTest, TooManyArgsInArgListLiteral)
}
#endif

TEST(FormatStringTest, EscapedBraces){
TEST(FormatStringTest, EscapedBraces)
{
auto result = scn::scan<int>("{}123", scn::runtime("{{}}{}"));
ASSERT_TRUE(result);
EXPECT_EQ(result->value(), 123);
Expand Down Expand Up @@ -274,3 +275,8 @@ TEST(FormatStringTest, LongFormatString4)
ASSERT_TRUE(result);
EXPECT_EQ(result->value(), '0');
}

// Scans a source whose bytes are 0xC3, '\n', 'a', 0xA4 against a format
// string whose literal text is 0xC3, '\n', '\a', 0xA4. In both, the UTF-8
// lead byte 0xC3 is not followed by a continuation byte.
// The result is never inspected and there are no assertions, so this
// presumably only checks that the call completes without crashing.
// NOTE(review): the format string contains the escape `\a` (BEL, 0x07) where
// the source has the letter 'a' -- confirm whether that is intentional.
TEST(FormatStringTest, MatchLiteralInvalidEncoding)
{
auto result = scn::scan<>("\xc3\na\xa4", "\xc3\n\a\xa4");
}
18 changes: 17 additions & 1 deletion tests/unittests/string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,13 @@ TEST(StringTest, WonkyInput)
auto input = std::string_view{source, sizeof(source)};

auto result = scn::scan<std::string>(input, "{:64c}");
ASSERT_FALSE(result);
EXPECT_EQ(result.error().code(), scn::scan_error::invalid_scanned_value);
#if 0
ASSERT_TRUE(result);
EXPECT_TRUE(result->range().empty());
EXPECT_EQ(result->value(), input);
#endif
}

TEST(StringTest, WonkyInputAndFormatWithTranscoding)
Expand All @@ -165,7 +169,7 @@ TEST(StringTest, WonkyInputAndFormatWithTranscoding)

auto result = scn::scan<std::wstring>(input, scn::runtime(input));
ASSERT_FALSE(result);
EXPECT_EQ(result.error().code(), scn::scan_error::invalid_format_string);
EXPECT_EQ(result.error().code(), scn::scan_error::invalid_scanned_value);
}

TEST(StringTest, WonkyInput2)
Expand All @@ -174,12 +178,16 @@ TEST(StringTest, WonkyInput2)
std::string_view{"\303 \245å\377åä\3035\377ååíääccccc\307c\244c"};

auto result = scn::scan<std::string_view>(input, "{}");
ASSERT_FALSE(result);
EXPECT_EQ(result.error().code(), scn::scan_error::invalid_scanned_value);
#if 0
ASSERT_TRUE(result);
EXPECT_EQ(result->value(), "\303");

result = scn::scan<std::string_view>(result->range(), "{}");
ASSERT_TRUE(result);
EXPECT_EQ(result->value(), input.substr(2));
#endif
}

TEST(StringTest, WonkyInput3)
Expand All @@ -191,15 +199,23 @@ TEST(StringTest, WonkyInput3)
auto input = std::string_view{source, sizeof(source)};

auto result = scn::scan<std::string>(input, "{}");
ASSERT_FALSE(result);
EXPECT_EQ(result.error().code(), scn::scan_error::invalid_scanned_value);
#if 0
ASSERT_TRUE(result);
EXPECT_TRUE(result->range().empty());
#endif
}

// Scanning a std::string from input whose first word ends in a lone 0xC3
// byte is expected to fail with `invalid_scanned_value`.
TEST(StringTest, RecoveryFromInvalidEncoding)
{
    // 'a', then a UTF-8 lead byte with no continuation byte, then a space.
    const std::string_view input{"a\xc3 "};
    auto scanned = scn::scan<std::string>(input, "{}");

    ASSERT_FALSE(scanned);
    const auto code = scanned.error().code();
    EXPECT_EQ(code, scn::scan_error::invalid_scanned_value);
#if 0
    // Disabled expectations for a successful scan that stops before the
    // trailing space.
    ASSERT_TRUE(scanned);
    EXPECT_EQ(scanned->value(), "a\xc3");
    EXPECT_EQ(scanned->begin(), input.end() - 1);
#endif
}
8 changes: 8 additions & 0 deletions tests/unittests/string_view_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,13 @@ TEST(StringViewTest, InvalidUtf8)
{
auto source = std::string_view{"\x82\xf5"};
auto result = scn::scan<std::string_view>(source, "{:64c}");
ASSERT_FALSE(result);
EXPECT_EQ(result.error().code(), scn::scan_error::invalid_scanned_value);
#if 0
ASSERT_TRUE(result);
EXPECT_TRUE(result->range().empty());
EXPECT_EQ(result->value(), source);
#endif
}

TEST(StringViewTest, WonkyInput)
Expand All @@ -117,7 +121,11 @@ TEST(StringViewTest, WonkyInput2)
auto input = std::string_view{source, sizeof(source)};

auto result = scn::scan<std::string_view>(input, "{:64c}");
ASSERT_FALSE(result);
EXPECT_EQ(result.error().code(), scn::scan_error::invalid_scanned_value);
#if 0
ASSERT_TRUE(result);
EXPECT_TRUE(result->range().empty());
EXPECT_EQ(result->value(), input);
#endif
}

0 comments on commit 877efb9

Please sign in to comment.