Skip to content

Commit

Permalink
latest from coda-oss
Browse files Browse the repository at this point in the history
  • Loading branch information
Dan Smith committed Jul 29, 2022
1 parent 2c9d64c commit 32e27fc
Show file tree
Hide file tree
Showing 27 changed files with 172 additions and 1,521 deletions.
10 changes: 7 additions & 3 deletions externals/coda-oss/ReleaseNotes.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
```
# coda-oss Release Notes

## Release 2022-06-29
## Release 2022-??-??
* remove *Expat* and *libXML* modules and support in **xml.lite**; only *Xerces* was actively used.
* **xml.lite** now uses UTF-8 internally and no longer tries to preserve incorrect behavior.

## [Release 2022-06-29](https://github.com/mdaus/coda-oss/releases/tag/2022-06-29)
* remove **modules/drivers/boost** as it was empty (and unused);
**modules/c++/serialize** depended on boost, so it has also been removed.
* Update to [zlib 1.2.12](https://www.zlib.net/zlib-1.2.12.tar.gz),
Expand All @@ -21,14 +25,14 @@
* Begin work on `CODA_OSS_API` (needed for building a shared-library/DLL)
* Add `run1D()` method to `mt::GenerationThreadPool`

## Release 2022-05-03
## [Release 2022-05-03](https://github.com/mdaus/coda-oss/releases/tag/2022-05-03)
* Fixed a bug in `Poly2D::atY()`; improved `flipXY()` behavior.
* Implement [std::filesystem::file_size()](https://en.cppreference.com/w/cpp/filesystem/file_size).
* use `inline` functions for `TEST_` macros
* force use of [64-bit `time_t`](https://en.wikipedia.org/wiki/Year_2038_problem)
* more routines now support a `std::span` overload; e.g., `io::InputStream::read()`.

## (Release 2022-02-22)
## [Release 2022-02-22](https://github.com/mdaus/coda-oss/releases/tag/2022-02-22)
* new `EncodedString` and `EncodedStringView` to manage strings in different encodings
* XML containing UTF-8 characters can now be validated
* Update to [GSL 4.0.0](https://github.com/microsoft/GSL/releases/tag/v4.0.0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ class CODA_OSS_API EncodedString final
return view().wstring();
}

// Reports whether the stored string is empty; this simply forwards to s_.empty().
bool empty() const
{
return s_.empty();
}

struct details final
{
static const std::string& string(const EncodedString& es) // for unit-testing
Expand Down
73 changes: 62 additions & 11 deletions externals/coda-oss/modules/c++/str/include/str/EncodedStringView.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,6 @@ class CODA_OSS_API EncodedStringView final
// doesn't expose "mIsUtf8" so there's (intentionally) no way for clients to know the encoding.
friend EncodedString;

coda_oss::u8string::const_pointer c_str() const
{
return cast<coda_oss::u8string::const_pointer>(mString.data());
}

str::W1252string w1252string() const; // c.f. std::filesystem::path::u8string()

public:
Expand All @@ -77,12 +72,10 @@ class CODA_OSS_API EncodedStringView final
// Need the const char* overloads to avoid creating temporary std::basic_string<> instances.
// Routines always return a copy, never a reference, so there's no additional overhead
// with storing a raw pointer rather than a pointer to std::basic_string<>.
EncodedStringView(coda_oss::u8string::const_pointer);
EncodedStringView(const coda_oss::u8string&);
EncodedStringView(str::W1252string::const_pointer);
EncodedStringView(const str::W1252string&);

// Don't want to make it easy to use these; a known encoding is preferred.
explicit EncodedStringView(coda_oss::u8string::const_pointer);
explicit EncodedStringView(const coda_oss::u8string&);
explicit EncodedStringView(str::W1252string::const_pointer);
explicit EncodedStringView(const str::W1252string&);
explicit EncodedStringView(std::string::const_pointer); // Assume platform native encoding: UTF-8 on Linux, Windows-1252 on Windows
explicit EncodedStringView(const std::string&); // Assume platform native encoding: UTF-8 on Linux, Windows-1252 on Windows

Expand All @@ -109,6 +102,45 @@ class CODA_OSS_API EncodedStringView final
// Using this routine can avoid an extra copy.
str::ui16string ui16string_() const; // use sparingly!

// These are for "advanced" use, most "normal" code should use the routines above.
// Raw pointer to the viewed characters, exactly as mString.data() reports them.
// No encoding conversion is performed here.
// NOTE(review): despite the name, NUL-termination presumably depends on how the view was created -- confirm against mString's type.
std::string::const_pointer c_str() const
{
return mString.data();
}
// UTF-8 flavored accessor: yields the viewed characters as a u8string pointer,
// or nullptr when this view is not flagged as UTF-8 (mIsUtf8 is false).
coda_oss::u8string::const_pointer c_u8str() const
{
    if (!mIsUtf8)
    {
        return nullptr;
    }
    return cast<coda_oss::u8string::const_pointer>(c_str());
}
// Number of elements in the view, as reported by mString.size().
// NOTE(review): presumably a count of stored code units in the view's own encoding, not of characters -- confirm.
size_t size() const
{
return mString.size();
}

// Input is encoded as specified on all platforms.
// Build a view over `s`, whose bytes are taken to be UTF-8 no matter the platform.
static EncodedStringView fromUtf8(const std::string& s)
{
    const auto utf8 = str::c_str<coda_oss::u8string>(s);
    return EncodedStringView(utf8);
}
// Build a view over the characters at `p`, which are taken to be UTF-8 no matter the platform.
static EncodedStringView fromUtf8(std::string::const_pointer p)
{
    const auto utf8 = str::cast<coda_oss::u8string::const_pointer>(p);
    return EncodedStringView(utf8);
}
// Build a view over `s`, whose bytes are taken to be Windows-1252 no matter the platform.
static EncodedStringView fromWindows1252(const std::string& s)
{
    const auto w1252 = str::c_str<str::W1252string>(s);
    return EncodedStringView(w1252);
}
// Build a view over the characters at `p`, which are taken to be Windows-1252 no matter the platform.
static EncodedStringView fromWindows1252(std::string::const_pointer p)
{
    const auto w1252 = str::cast<str::W1252string::const_pointer>(p);
    return EncodedStringView(w1252);
}

// Convenience wrapper: hands toUtf8() a local std::string and returns toUtf8()'s result by value.
// NOTE(review): toUtf8() is declared elsewhere; presumably it fills `retval` with the UTF-8
// text and returns a reference to it -- confirm before restructuring this for NRVO.
std::string asUtf8() const
{
std::string retval;
return toUtf8(retval);
}
std::string asWindows1252() const;

bool operator_eq(const EncodedStringView&) const;

struct details final
Expand All @@ -131,6 +163,25 @@ inline bool operator!=(const EncodedStringView& lhs, const EncodedStringView& rh
return !(lhs == rhs);
}

// Since we'd really like to "traffic" in UTF-8 strings (at least when encoding is a consideration)
// make that comparison easy.
// Equality against a UTF-8 string: wrap the string in a view, then compare view-to-view.
inline bool operator==(const EncodedStringView& lhs, const coda_oss::u8string& rhs)
{
    const EncodedStringView rhsView(rhs);
    return lhs == rhsView;
}
// Inequality against a UTF-8 string, defined as the negation of the matching operator==.
inline bool operator!=(const EncodedStringView& lhs, const coda_oss::u8string& rhs)
{
    const bool equal = (lhs == rhs);
    return !equal;
}
// Same comparison with the operands swapped; defers to the (view, u8string) form.
inline bool operator==(const coda_oss::u8string& lhs, const EncodedStringView& rhs)
{
    const bool equal = (rhs == lhs);
    return equal;
}
// Inequality with the UTF-8 string on the left, defined as the negation of the matching operator==.
inline bool operator!=(const coda_oss::u8string& lhs, const EncodedStringView& rhs)
{
    const bool equal = (lhs == rhs);
    return !equal;
}

inline std::ostream& operator<<(std::ostream& os, const EncodedStringView& esv)
{
os << esv.native();
Expand Down
6 changes: 4 additions & 2 deletions externals/coda-oss/modules/c++/str/include/str/Manip.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "config/compiler_extensions.h"
#include "config/Exports.h"
#include "coda_oss/CPlusPlus.h"
#include "coda_oss/string.h"
#include "str/Convert.h"

namespace str
Expand Down Expand Up @@ -68,8 +69,9 @@ inline const CharT* data(const std::basic_string<CharT>& s) noexcept // to make
* @param s String to trim
*/
CODA_OSS_API void trim(std::string& s);
CODA_OSS_API std::string strip(const std::string& s);
CODA_OSS_API std::string& strip(std::string& s);
CODA_OSS_API std::string trim(const std::string& s);
CODA_OSS_API void trim(coda_oss::u8string& s);
CODA_OSS_API coda_oss::u8string trim(const coda_oss::u8string& s);

/**
* Checks the end of s with match
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ str::EncodedString& str::EncodedString::operator=(const EncodedStringView& v)
{
if (v.mIsUtf8)
{
assign(v.c_str());
assign(v.c_u8str());
}
else
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ str::W1252string str::EncodedStringView::w1252string() const
{
return str::details::to_w1252string(mString.data(), mString.size(), mIsUtf8);
}
// Returns the viewed text as a std::string holding Windows-1252 bytes:
// converts with w1252string(), then re-types and copies the result into a std::string.
// NOTE(review): str::c_str<std::string>() presumably yields a pointer into `result`, so the
// named local has to outlive the conversion on the return line -- confirm before inlining it.
std::string str::EncodedStringView::asWindows1252() const
{
const auto result = w1252string();
return str::c_str<std::string>(result); // cast & copy
}

bool str::EncodedStringView::operator_eq(const EncodedStringView& rhs) const
{
Expand All @@ -113,7 +118,7 @@ bool str::EncodedStringView::operator_eq(const EncodedStringView& rhs) const
auto& w1252 = !lhs.mIsUtf8 ? lhs : rhs;

// If UTF-8 is native on this platform, convert to UTF-8; otherwise do a native comparison
return mNativeIsUtf8 ? utf8.c_str() == w1252.u8string() : utf8.native() == w1252.mString.data();
return mNativeIsUtf8 ? utf8.c_u8str() == w1252.u8string() : utf8.native() == w1252.mString.data();
}


31 changes: 21 additions & 10 deletions externals/coda-oss/modules/c++/str/source/Manip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,36 +66,47 @@ char toupperCheck(char c)

namespace str
{
void trim(std::string & s)

// TODO: https://stackoverflow.com/questions/31959532/best-way-to-remove-white-spaces-from-stdstring
/**
 * Remove leading and trailing whitespace from s, in place.
 *
 * Only code units in the ASCII range (< 0x80) are ever considered whitespace.
 * Larger values are lead/continuation bytes of a UTF-8 sequence (or accented
 * characters in a single-byte code page); treating, say, 0x85 or 0xA0 as a
 * space would cut a multi-byte character in half.
 *
 * @param s String to trim; an empty or all-whitespace string becomes empty.
 */
template<typename TChar>
inline void trim_(std::basic_string<TChar> & s)
{
    const auto is_space = [](TChar ch)
    {
        // Going through an unsigned type maps negative (signed char) values
        // to something large, so they are rejected by the range check below.
        const auto code = static_cast<unsigned long>(ch);
        if (code >= 0x80)
        {
            return false;
        }
        return iswspace(static_cast<wint_t>(code)) != 0;
    };

    // Leading whitespace
    size_t first = 0;
    while ((first < s.length()) && is_space(s[first]))
    {
        ++first;
    }
    s.erase(0, first);

    // Trailing whitespace; counting down with size_t (no cast to int) keeps
    // this correct for strings longer than INT_MAX.
    size_t last = s.length();
    while ((last > 0) && is_space(s[last - 1]))
    {
        --last;
    }
    s.erase(last); // no-op when last == s.length()
}

// https://stackoverflow.com/questions/31959532/best-way-to-remove-white-spaces-from-stdstring
std::string& strip(std::string& str)
void trim(std::string& s)
{
trim_(s);
}
// Non-mutating flavor of trim(): the input is left untouched and a trimmed copy is returned.
std::string trim(const std::string& str)
{
    std::string trimmed(str);
    trim(trimmed);
    return trimmed;
}
void trim(coda_oss::u8string& s)
{
str.erase(std::remove_if(str.begin(), str.end(), ::isspace), str.end());
return str;
trim_(s);
}
std::string strip(const std::string& str)
coda_oss::u8string trim(const coda_oss::u8string& str)
{
auto retval = str;
return strip(retval);
trim(retval);
return retval;
}

bool ends_with(const std::string& s, const std::string& match) noexcept
Expand Down
52 changes: 30 additions & 22 deletions externals/coda-oss/modules/c++/str/unittests/test_base_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,29 @@
#include <vector>
#include <string>
#include <iterator>

#include <std/string>

#include "coda_oss/CPlusPlus.h"

#include <import/str.h>
#include <str/EncodedString.h>
#include <str/Encoding.h>

#include "TestCase.h"

static std::string to_string(const coda_oss::u8string& value)
// It seems that a macro is better than a utility routine, see https://github.com/tahonermann/char8_t-remediation
// C++20 changed the type of u8 to char8_t* https://en.cppreference.com/w/cpp/language/string_literal
// Not putting this everywhere because (1) well, it's a macro, and (2) it's mostly
// only test code that uses string literals.
// U8(ch)  -> one UTF-8 code unit
// U8s(s)  -> pointer to UTF-8 code units for the string literal s
//
// U8() is a cast in both configurations (not a `u8` token-paste) because it is
// also invoked with variables, e.g. U8(ch); pasting would turn that into the
// undeclared identifier `u8ch`. U8s() is only ever given string literals, so
// the C++20 branch can paste the prefix on.
#if CODA_OSS_cpp20
#define U8(ch) static_cast<std::u8string::value_type>(ch)
#define U8s(s) u8##s
#else
#define U8(ch) static_cast<std::char8_t>(ch)
#define U8s(s) static_cast<const std::char8_t*>(static_cast<const void*>(s))
#endif

// Test helper: produces a std::string from a std::u8string via str::c_str<std::string>();
// the returned string is a copy.
static std::string to_string(const std::u8string& value)
{
return str::c_str<std::string>(value); // copy
}
Expand Down Expand Up @@ -75,29 +88,24 @@ TEST_CASE(testCharToString)
TEST_ASSERT_EQ(str::toString<char>(65), "A");
}

static coda_oss::u8string fromWindows1252(const std::string& s)
// Test helper: hand the bytes of `s` to str::fromWindows1252().
static inline std::u8string fromWindows1252(const std::string& s)
{
    // The bytes of s are Windows-1252, whatever the platform.
    const auto* const bytes = s.c_str();
    const auto length = s.size();
    return str::fromWindows1252(bytes, length);
}

template<typename T>
static constexpr std::u8string::value_type cast8(T ch)
{
static_assert(sizeof(std::u8string::value_type) == sizeof(char), "sizeof(Char8_T) != sizeof(char)");
return static_cast<std::u8string::value_type>(ch);
}
template <typename T>
static constexpr std::u32string::value_type cast32(T ch)
// Convert a single character-like value to one UTF-32 code unit with a plain static_cast.
template<typename TChar>
static inline constexpr std::u32string::value_type U(TChar ch)
{
    using utf32_t = std::u32string::value_type;
    return static_cast<utf32_t>(ch);
}

TEST_CASE(test_string_to_u8string_ascii)
{
{
const std::string input = "|\x00"; // ASCII, "|<NULL>"
const auto actual = fromWindows1252(input);
const std::u8string expected{cast8('|')}; // '\x00' is the end of the string in C/C++
const std::u8string expected{U8('|')}; // '\x00' is the end of the string in C/C++
TEST_ASSERT_EQ(actual, expected);
}
constexpr uint8_t start_of_heading = 0x01;
Expand All @@ -106,9 +114,9 @@ TEST_CASE(test_string_to_u8string_ascii)
{
const std::string input { '|', static_cast<std::string::value_type>(ch), '|'};
const auto actual = fromWindows1252(input);
const std::u8string expected8{cast8('|'), cast8(ch), cast8('|')};
const std::u8string expected8{U8('|'), U8(ch), U8('|')};
TEST_ASSERT_EQ(actual, expected8);
const std::u32string expected{cast32('|'), cast32(ch), cast32('|')};
const std::u32string expected{U'|', U(ch), U'|'};
TEST_ASSERT_EQ(to_string(actual), to_string(expected));
}
}
Expand All @@ -119,17 +127,17 @@ TEST_CASE(test_string_to_u8string_windows_1252)
{
const std::string input = "|\x80|"; // Windows-1252, "|€|"
const auto actual = fromWindows1252(input);
const std::u8string expected8{cast8('|'), cast8('\xE2'), cast8('\x82'), cast8('\xAC'), cast8('|')}; // UTF-8, "|€|"
const std::u8string expected8{U8s("|\xE2\x82\xAC|")}; // UTF-8, "|€|"
TEST_ASSERT_EQ(actual, expected8);
const std::u32string expected{cast32('|'), 0x20AC, cast32('|')}; // UTF-32, "|€|"
const std::u32string expected{U'|', 0x20AC, U'|'}; // UTF-32, "|€|"
TEST_ASSERT_EQ(to_string(actual), to_string(expected));
}
{
const std::string input = "|\x9F|"; // Windows-1252, "|Ÿ|"
const auto actual = fromWindows1252(input);
const std::u8string expected8{cast8('|'), cast8('\xC5'), cast8('\xB8'), cast8('|')}; // UTF-8, "|Ÿ|"
const std::u8string expected8{U8s("|\xC5\xB8|")}; // UTF-8, "|Ÿ|"
TEST_ASSERT_EQ(actual, expected8);
const std::u32string expected{cast32('|'), 0x0178, cast32('|')}; // UTF-32, "|Ÿ|"
const std::u32string expected{U'|', 0x0178, U'|'}; // UTF-32, "|Ÿ|"
TEST_ASSERT_EQ(to_string(actual), to_string(expected));
}
{
Expand All @@ -138,9 +146,9 @@ TEST_CASE(test_string_to_u8string_windows_1252)
{
const std::string input{'|', ch, '|'};
const auto actual = fromWindows1252(input);
static const std::u8string expected8{cast8('|'), cast8('\xEF'), cast8('\xBF'), cast8('\xBD'), cast8('|')}; // UTF-8, "|<REPLACEMENT CHARACTER>|"
static const std::u8string expected8{U8s("|\xEF\xBF\xBD|")}; // UTF-8, "|<REPLACEMENT CHARACTER>|"
TEST_ASSERT_EQ(actual, expected8);
const std::u32string expected{cast32('|'), 0xfffd, cast32('|')}; // UTF-32, "|<REPLACEMENT CHARACTER>|"
const std::u32string expected{U'|', 0xfffd, U'|'}; // UTF-32, "|<REPLACEMENT CHARACTER>|"
TEST_ASSERT_EQ(to_string(actual), to_string(expected));
}
}
Expand Down Expand Up @@ -208,7 +216,7 @@ TEST_CASE(test_string_to_u8string_iso8859_1)
const std::string input_ { '|', static_cast<std::string::value_type>(ch), '|'};
const str::W1252string input(str::c_str<str::W1252string>(input_));
const auto actual = to_u8string(input);
const std::u32string expected{cast32('|'), cast32(ch), cast32('|')};
const std::u32string expected{U'|', U(ch), U'|'};
TEST_ASSERT_EQ(to_string(actual), to_string(expected));

// Can't compare the values with == because TEST_ASSERT_EQ()
Expand Down Expand Up @@ -264,7 +272,7 @@ TEST_CASE(test_change_case)
// https://en.wikipedia.org/wiki/%C3%89#Character_mappings
static const str::EncodedString& classificationText_utf_8()
{
static const str::EncodedString retval(str::cast<coda_oss::u8string::const_pointer>("A\xc3\x89IOU")); // UTF-8 "AÉIOU"
static const str::EncodedString retval(str::cast<std::u8string::const_pointer>("A\xc3\x89IOU")); // UTF-8 "AÉIOU"
return retval;
}
static const str::EncodedString& classificationText_iso8859_1()
Expand Down
Loading

0 comments on commit 32e27fc

Please sign in to comment.