Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

experiment with UTF-8/16 C++ iterators #3096

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions icu4c/source/common/common.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -1258,6 +1258,9 @@
<CustomBuild Include="unicode\utf16.h">
<Filter>strings</Filter>
</CustomBuild>
<CustomBuild Include="unicode\utf16cppiter.h">
<Filter>strings</Filter>
</CustomBuild>
<CustomBuild Include="unicode\utf32.h">
<Filter>strings</Filter>
</CustomBuild>
Expand Down
153 changes: 153 additions & 0 deletions icu4c/source/common/unicode/utf16cppiter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

// utf16cppiter.h
// created: 2024aug12 Markus W. Scherer

#ifndef __UTF16CPPITER_H__
#define __UTF16CPPITER_H__

#include <string_view>

#include "unicode/utypes.h"

#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API

#include "unicode/utf16.h"
#include "unicode/uversion.h"

/**
* \file
* \brief C++ header-only API: C++ iterators over Unicode 16-bit strings (=UTF-16 if well-formed).
*/

#ifndef U_HIDE_DRAFT_API

namespace U_HEADER_ONLY_NAMESPACE {

// Some defined behaviors for handling ill-formed 16-bit strings.
// TODO: Maybe share with 8-bit strings, but the SURROGATE option does not have an equivalent there.
//
// TODO: A possible alternative to an enum might be some kind of function template
// which would be fully customizable.
// The operator*() return value might then want to be a template parameter as well.
// For example, for a well-formed sequence, the return value could be
// a tuple of (code point, well-formed), or a string view, or...
// (And then the caller could choose between UChar32 and char32_t.)
// However, all of that would make the API more complex and daunting.
// Selects what an iterator over a 16-bit string reports for ill-formed input.
// NOTE(review): the per-value notes below are inferred from the enumerator names;
// confirm them against the iterator implementation.
enum U16IllFormedBehavior {
    U16_BEHAVIOR_NEGATIVE = 0,  // presumably: report a negative value
    U16_BEHAVIOR_FFFD = 1,      // presumably: report U+FFFD
    U16_BEHAVIOR_SURROGATE = 2  // presumably: report the unpaired surrogate itself
};

// TODO: Consider a template parameter for UChar32 vs. char32_t vs. uint32_t.

/**
 * A code unit sequence for one code point returned by U16Iterator.
 *
 * This is a plain aggregate: U16Iterator builds it with brace initialization
 * in member declaration order, so the order of the fields must not change.
 * The struct does not own the code units that `data` points to.
 *
 * @tparam Unit16 char16_t or uint16_t or (on Windows) wchar_t
 * @draft ICU 77
 */
template<typename Unit16>
struct U16OneSeq {
    // Order of fields with padding and access frequency in mind.

    /** The decoded code point; for an ill-formed sequence, a substitute chosen by the iterator. */
    UChar32 codePoint = 0;
    /** Number of code units in this sequence. */
    uint8_t length = 0;
    /** True if the sequence is well-formed; false for an unpaired surrogate. */
    bool isWellFormed = false;
    /**
     * Pointer to the first code unit of the sequence inside the iterated string.
     * Defaults to nullptr so that a default-constructed object never carries
     * an indeterminate pointer (stringView() is then an empty view).
     */
    const Unit16 *data = nullptr;

    /** @return a view of the `length` code units starting at `data` */
    std::basic_string_view<Unit16> stringView() const {
        return std::basic_string_view<Unit16>(data, length);
    }

    // TODO: std::optional<UChar32> maybeCodePoint() const ? (nullopt if !isWellFormed)
};

/**
* Validating iterator over the code points in a Unicode 16-bit string.
*
* TODO: check doxygen syntax for template parameters
* @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
* @param U16IllFormedBehavior TODO
* @draft ICU 77
*/
template<typename Unit16, U16IllFormedBehavior behavior>
class U16Iterator {
public:
// TODO: make private, make friends
U16Iterator(const Unit16 *start, const Unit16 *p, const Unit16 *limit) :
start(start), p(p), limit(limit) {}
// TODO: We might try to support limit==nullptr, similar to U16_ macros supporting length<0.
// Test pointers for == or != but not < or >.

U16Iterator(const U16Iterator &other) = default;

bool operator==(const U16Iterator &other) const { return p == other.p; }
bool operator!=(const U16Iterator &other) const { return !operator==(other); }

const U16OneSeq<Unit16> operator*() const {
// TODO: assert p != limit -- more precisely: start <= p < limit
// Similar to U16_NEXT_OR_FFFD().
UChar32 c = *p;
if (!U16_IS_SURROGATE(c)) {
return {c, 1, true, p};
} else {
uint16_t c2;
if (U16_IS_SURROGATE_LEAD(c) && (p + 1) != limit && U16_IS_TRAIL(c2 = p[1])) {
c = U16_GET_SUPPLEMENTARY(c, c2);
return {c, 2, true, p};
} else {
// TODO: U16IllFormedBehavior
return {0xfffd, 1, false, p};
}
}
}

U16Iterator &operator++() { // pre-increment
// TODO: assert p != limit -- more precisely: start <= p < limit
// Similar to U16_FWD_1().
if (U16_IS_LEAD(*p++) && p != limit && U16_IS_TRAIL(*p)) {
++p;
}
return *this;
}

U16Iterator operator++(int) { // post-increment
// TODO: assert p != limit -- more precisely: start <= p < limit
U16Iterator result(*this);
// More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler
// amortize work between operator*() and operator++(int) in typical *it++ usage.
// Otherwise this is slightly less efficient because it tests a lead surrogate twice.
UChar32 c = *p++;
if (U16_IS_SURROGATE(c) &&
U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
++p;
}
return result;
}

private:
// In a validating iterator, we need start & limit so that when we read a code point
// (forward or backward) we can test if there are enough code units.
const Unit16 *const start;
const Unit16 *p;
const Unit16 *const limit;
};

// ------------------------------------------------------------------------- ***

// TODO: Non-validating iterator over the code points in a Unicode 16-bit string.
// Assumes well-formed UTF-16. Otherwise the behavior is undefined.
// template<typename Unit16>
// class U16UnsafeIterator
// TODO: only p, no start, no limit
// TODO: can/should we read the code point only in operator*()?
// if we read it in the constructor, then we would still need start/limit...

} // namespace U_HEADER_ONLY_NAMESPACE

#endif // U_HIDE_DRAFT_API
#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
#endif // __UTF16CPPITER_H__
2 changes: 1 addition & 1 deletion icu4c/source/test/intltest/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ numbertest_parse.o numbertest_doubleconversion.o numbertest_skeletons.o \
static_unisets_test.o numfmtdatadriventest.o numbertest_range.o erarulestest.o \
formattedvaluetest.o formatted_string_builder_test.o numbertest_permutation.o \
units_data_test.o units_router_test.o units_test.o displayoptions_test.o \
numbertest_simple.o uchar_type_build_test.o usetheaderonlytest.o
numbertest_simple.o uchar_type_build_test.o usetheaderonlytest.o utfcppitertest.o

DEPS = $(OBJECTS:.o=.d)

Expand Down
1 change: 1 addition & 0 deletions icu4c/source/test/intltest/intltest.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@
<ClCompile Include="sfwdchit.cpp" />
<ClCompile Include="strcase.cpp" />
<ClCompile Include="ustrtest.cpp" />
<ClCompile Include="utfcppitertest.cpp" />
<ClCompile Include="utxttest.cpp" />
<ClCompile Include="cpdtrtst.cpp" />
<ClCompile Include="ittrans.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions icu4c/source/test/intltest/intltest.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,9 @@
<ClCompile Include="ustrtest.cpp">
<Filter>strings</Filter>
</ClCompile>
<ClCompile Include="utfcppitertest.cpp">
<Filter>strings</Filter>
</ClCompile>
<ClCompile Include="utxttest.cpp">
<Filter>strings</Filter>
</ClCompile>
Expand Down
2 changes: 2 additions & 0 deletions icu4c/source/test/intltest/itutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ extern IntlTest *createPluralMapTest();
extern IntlTest *createStaticUnicodeSetsTest();
#endif
static IntlTest *createUHashTest();
extern IntlTest *createU16IteratorTest();

void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
Expand Down Expand Up @@ -84,6 +85,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &
TESTCASE_AUTO_CREATE_CLASS(LocaleMatcherTest);
TESTCASE_AUTO_CREATE_CLASS(UHashTest);
TESTCASE_AUTO_CREATE_CLASS(USetHeaderOnlyTest);
TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest);
TESTCASE_AUTO_END;
}

Expand Down
69 changes: 69 additions & 0 deletions icu4c/source/test/intltest/utfcppitertest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

// utfcppitertest.cpp
// created: 2024aug12 Markus W. Scherer

#include <string_view>

// Test header-only ICU C++ APIs. Do not use other ICU C++ APIs.
// Non-default configuration:
#define U_SHOW_CPLUSPLUS_API 0
// Default configuration:
// #define U_SHOW_CPLUSPLUS_HEADER_API 1

#include "unicode/utypes.h"
#include "unicode/utf16cppiter.h"
#include "intltest.h"

// Makes u"literal"sv std::u16string_view literals possible.
// https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv
using namespace std::string_view_literals;

using U_HEADER_ONLY_NAMESPACE::U16_BEHAVIOR_NEGATIVE;
using U_HEADER_ONLY_NAMESPACE::U16Iterator;
using U_HEADER_ONLY_NAMESPACE::U16OneSeq;

// intltest suite for the header-only U16Iterator API.
class U16IteratorTest : public IntlTest {
public:
    U16IteratorTest() = default;

    // Test dispatcher; the test methods are registered in its definition.
    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = nullptr) override;

    // Iterates over one well-formed string and checks what the iterator yields.
    void testExperiment();
};

// Factory for the test suite; itutil.cpp declares this function extern.
// Returns a heap-allocated U16IteratorTest.
extern IntlTest *createU16IteratorTest() {
    IntlTest *suite = new U16IteratorTest();
    return suite;
}

// Test dispatcher for this suite.
// Logs the suite name when exec is true, then hands control to the TESTCASE_AUTO macros.
// NOTE(review): the TESTCASE_AUTO_* macros are not defined in this file (presumably intltest.h);
// they presumably map `index` to a registered test method and set `name` -- confirm there.
// The `par` parameter is unused.
void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
if(exec) {
logln("TestSuite U16IteratorTest: ");
}
TESTCASE_AUTO_BEGIN;
// One TESTCASE_AUTO line per test method of this class.
TESTCASE_AUTO(testExperiment);
TESTCASE_AUTO_END;
}

void U16IteratorTest::testExperiment() {
IcuTestErrorCode errorCode(*this, "testExperiment");
std::u16string_view good(u"abçカ🚴"sv);
const char16_t *goodLimit = good.data() + good.length();
U16Iterator<char16_t, U16_BEHAVIOR_NEGATIVE> goodIter(good.data(), good.data(), goodLimit);
assertEquals("goodIter[0] * codePoint", u'a', (*goodIter).codePoint);
++goodIter; // pre-increment
assertEquals("goodIter[1] * codePoint", u'b', (*goodIter).codePoint);
++goodIter;
assertEquals("goodIter[2] * codePoint", u'ç', (*goodIter++).codePoint); // post-increment
assertEquals("goodIter[3] * codePoint", u'カ', (*goodIter).codePoint);
++goodIter;
const U16OneSeq<char16_t> &seq = *goodIter++;
assertEquals("goodIter[4] * codePoint", U'🚴', seq.codePoint);
assertEquals("goodIter[4] * length", 2, seq.length);
assertTrue("goodIter[4] * stringView()", seq.stringView() == u"🚴"sv);
U16Iterator<char16_t, U16_BEHAVIOR_NEGATIVE> goodEndIter(good.data(), goodLimit, goodLimit);
assertTrue("goodIter == goodEndIter", goodIter == goodEndIter);

// TODO: test ill-formed, and much more...
}
Loading