latest from coda-oss

ngageoint · Jul 29, 2022 · 32e27fc · 32e27fc
1 parent 2c9d64c
commit 32e27fc
Show file tree

Hide file tree

Showing 27 changed files with 172 additions and 1,521 deletions.
diff --git a/externals/coda-oss/ReleaseNotes.md b/externals/coda-oss/ReleaseNotes.md
@@ -11,7 +11,11 @@
  ```
 # coda-oss Release Notes
 
-## Release 2022-06-29
+## Release 2022-??-??
+* remove *Expat* and *libXML* modules and support in **xml.lite**; only *Xerces* was actively used.
+* **xml.lite** now uses UTF-8 internally and no longer tries to preserve incorrect behavior.
+
+## [Release 2022-06-29](https://github.com/mdaus/coda-oss/releases/tag/2022-06-29)
 * remove **modules/drivers/boost** as it was empty (and unused);
   **modules/c++/serialize** depended on boost, so it has also been removed.
 * Update to [zlib 1.2.12](https://www.zlib.net/zlib-1.2.12.tar.gz),
@@ -21,14 +25,14 @@
 * Begin work on `CODA_OSS_API` (needed for building a shared-library/DLL)
 * Add `run1D()` method to `mt::GenerationThreadPool` 
 
-## Release 2022-05-03
+## [Release 2022-05-03](https://github.com/mdaus/coda-oss/releases/tag/2022-05-03)
 * Fixed a bug in `Poly2D::atY()`; improved `flipXY()` behavior.
 * Implement [std::filesystem::file_size()](https://en.cppreference.com/w/cpp/filesystem/file_size).
 * use `inline` functions for `TEST_` macros
 * force use of [64-bit `time_t`](https://en.wikipedia.org/wiki/Year_2038_problem)
 * more routines now support a `std::span` overload; e.g., `io::InputStream::read()`.
 
-## (Release 2022-02-22)
+## [Release 2022-02-22](https://github.com/mdaus/coda-oss/releases/tag/2022-02-22)
 * new `EncodedString` and `EncodedStringView` to manage strings in different encodings
 * XML containing UTF-8 characters can now be validated
 * Update to [GSL 4.0.0](https://github.com/microsoft/GSL/releases/tag/v4.0.0)

diff --git a/externals/coda-oss/modules/c++/str/include/str/EncodedString.h b/externals/coda-oss/modules/c++/str/include/str/EncodedString.h
@@ -125,6 +125,11 @@ class CODA_OSS_API EncodedString final
         return view().wstring();
     }
 
+    bool empty() const
+    {
+        return s_.empty();
+    }
+
     struct details final
     {
         static const std::string& string(const EncodedString& es) // for unit-testing

diff --git a/externals/coda-oss/modules/c++/str/include/str/EncodedStringView.h b/externals/coda-oss/modules/c++/str/include/str/EncodedStringView.h
@@ -59,11 +59,6 @@ class CODA_OSS_API EncodedStringView final
     // doesn't expose "mIsUtf8" so there's (intentionally) no way for clients to know the encoding.
     friend EncodedString;
 
-    coda_oss::u8string::const_pointer c_str() const
-    {
-        return cast<coda_oss::u8string::const_pointer>(mString.data());
-    }
-
     str::W1252string w1252string() const;  // c.f. std::filesystem::path::u8string()
 
 public:
@@ -77,12 +72,10 @@ class CODA_OSS_API EncodedStringView final
     // Need the const char* overloads to avoid creating temporary std::basic_string<> instances.
     // Routines always return a copy, never a reference, so there's no additional overhead
     // with storing a raw pointer rather than a pointer to std::basic_string<>.
-    EncodedStringView(coda_oss::u8string::const_pointer);
-    EncodedStringView(const coda_oss::u8string&);
-    EncodedStringView(str::W1252string::const_pointer);
-    EncodedStringView(const str::W1252string&);
-
-    // Don't want to make it easy to use these; a known encoding is preferred.
+    explicit EncodedStringView(coda_oss::u8string::const_pointer);
+    explicit EncodedStringView(const coda_oss::u8string&);
+    explicit EncodedStringView(str::W1252string::const_pointer);
+    explicit EncodedStringView(const str::W1252string&);
     explicit EncodedStringView(std::string::const_pointer);  // Assume platform native encoding: UTF-8 on Linux, Windows-1252 on Windows
     explicit EncodedStringView(const std::string&);  // Assume platform native encoding: UTF-8 on Linux, Windows-1252 on Windows
 
@@ -109,6 +102,45 @@ class CODA_OSS_API EncodedStringView final
     // Using this routine can avoid an extra copy.
     str::ui16string ui16string_() const; // use sparingly!
 
+    // These are for "advanced" use, most "normal" code should use the routines above.
+    std::string::const_pointer c_str() const
+    {
+        return mString.data();
+    }
+    coda_oss::u8string::const_pointer c_u8str() const
+    {
+        return mIsUtf8 ? cast<coda_oss::u8string::const_pointer>(c_str()) : nullptr;
+    }
+    size_t size() const
+    {
+        return mString.size();
+    }
+
+    // Input is encoded as specified on all platforms.
+    static EncodedStringView fromUtf8(const std::string& s)
+    {
+        return EncodedStringView(str::c_str<coda_oss::u8string>(s));
+    }
+    static EncodedStringView fromUtf8(std::string::const_pointer p)
+    {
+        return EncodedStringView(str::cast<coda_oss::u8string::const_pointer>(p));
+    }
+    static EncodedStringView fromWindows1252(const std::string& s)
+    {
+        return EncodedStringView(str::c_str<str::W1252string>(s));
+    }
+    static EncodedStringView fromWindows1252(std::string::const_pointer p)
+    {
+        return EncodedStringView(str::cast<str::W1252string::const_pointer>(p));
+    }
+
+    std::string asUtf8() const
+    {
+        std::string retval;
+        return toUtf8(retval);
+    }
+    std::string asWindows1252() const;
+
     bool operator_eq(const EncodedStringView&) const;
 
     struct details final
@@ -131,6 +163,25 @@ inline bool operator!=(const EncodedStringView& lhs, const EncodedStringView& rh
     return !(lhs == rhs);
 }
 
+// Since we'd really like to "traffic" in UTF-8 strings (at least when encoding is a consideration)
+// make that comparison easy.
+inline bool operator==(const EncodedStringView& lhs, const coda_oss::u8string& rhs)
+{
+    return lhs == EncodedStringView(rhs);
+}
+inline bool operator!=(const EncodedStringView& lhs, const coda_oss::u8string& rhs)
+{
+    return !(lhs == rhs);
+}
+inline bool operator==(const coda_oss::u8string& lhs, const EncodedStringView& rhs)
+{
+    return rhs == lhs;
+}
+inline bool operator!=(const coda_oss::u8string& lhs, const EncodedStringView& rhs)
+{
+    return !(lhs == rhs);
+}
+
 inline std::ostream& operator<<(std::ostream& os, const EncodedStringView& esv)
 {
     os << esv.native();

diff --git a/externals/coda-oss/modules/c++/str/include/str/Manip.h b/externals/coda-oss/modules/c++/str/include/str/Manip.h
@@ -32,6 +32,7 @@
 #include "config/compiler_extensions.h"
 #include "config/Exports.h"
 #include "coda_oss/CPlusPlus.h"
+#include "coda_oss/string.h"
 #include "str/Convert.h"
 
 namespace str
@@ -68,8 +69,9 @@ inline const CharT* data(const std::basic_string<CharT>& s) noexcept // to make
  *  @param  s  String to trim
  */
 CODA_OSS_API void trim(std::string& s);
-CODA_OSS_API std::string strip(const std::string& s);
-CODA_OSS_API std::string& strip(std::string& s);
+CODA_OSS_API std::string trim(const std::string& s);
+CODA_OSS_API void trim(coda_oss::u8string& s);
+CODA_OSS_API coda_oss::u8string trim(const coda_oss::u8string& s);
 
 /**
  *  Checks the end of s with match

diff --git a/externals/coda-oss/modules/c++/str/source/EncodedString.cpp b/externals/coda-oss/modules/c++/str/source/EncodedString.cpp
@@ -79,7 +79,7 @@ str::EncodedString& str::EncodedString::operator=(const EncodedStringView& v)
 {
     if (v.mIsUtf8)
     {
-        assign(v.c_str());
+        assign(v.c_u8str());
     }
     else
     {

diff --git a/externals/coda-oss/modules/c++/str/source/EncodedStringView.cpp b/externals/coda-oss/modules/c++/str/source/EncodedStringView.cpp
@@ -91,6 +91,11 @@ str::W1252string str::EncodedStringView::w1252string() const
 {
     return str::details::to_w1252string(mString.data(), mString.size(), mIsUtf8);
 }
+std::string str::EncodedStringView::asWindows1252() const
+{
+    const auto result = w1252string();
+    return str::c_str<std::string>(result); // cast & copy
+}
 
 bool str::EncodedStringView::operator_eq(const EncodedStringView& rhs) const
 {
@@ -113,7 +118,7 @@ bool str::EncodedStringView::operator_eq(const EncodedStringView& rhs) const
     auto& w1252 = !lhs.mIsUtf8 ? lhs : rhs;
 
     // If UTF-8 is native on this platform, convert to UTF-8; otherwise do a native comparison
-    return mNativeIsUtf8 ? utf8.c_str() == w1252.u8string() : utf8.native() == w1252.mString.data();
+    return mNativeIsUtf8 ? utf8.c_u8str() == w1252.u8string() : utf8.native() == w1252.mString.data();
 }
 
 
diff --git a/externals/coda-oss/modules/c++/str/source/Manip.cpp b/externals/coda-oss/modules/c++/str/source/Manip.cpp
@@ -66,36 +66,47 @@ char toupperCheck(char c)
 
 namespace str
 {
-void trim(std::string & s)
+
+// TODO: https://stackoverflow.com/questions/31959532/best-way-to-remove-white-spaces-from-stdstring
+template<typename TChar>
+inline void trim_(std::basic_string<TChar> & s)
 {
     size_t i;
     for (i = 0; i < s.length(); i++)
     {
-        if (!iswspace(s[i]))
+        if (!iswspace(static_cast<wint_t>(s[i])))
             break;
     }
     s.erase(0, i);
 
     for (i = s.length() - 1; (int) i >= 0; i--)
     {
-        if (!iswspace(s[i]))
+        if (!iswspace(static_cast<wint_t>(s[i])))
             break;
 
     }
     if (i + 1 < s.length())
         s.erase(i + 1);
 }
-
-// https://stackoverflow.com/questions/31959532/best-way-to-remove-white-spaces-from-stdstring
-std::string& strip(std::string& str)
+void trim(std::string& s)
+{
+    trim_(s);
+}
+std::string trim(const std::string& str)
+{
+    auto retval = str;
+    trim(retval);
+    return retval;
+}
+void trim(coda_oss::u8string& s)
 {
-    str.erase(std::remove_if(str.begin(), str.end(), ::isspace), str.end());
-    return str;
+    trim_(s);
 }
-std::string strip(const std::string& str)
+coda_oss::u8string trim(const coda_oss::u8string& str)
 {
     auto retval = str;
-    return strip(retval);
+    trim(retval);
+    return retval;
 }
 
 bool ends_with(const std::string& s, const std::string& match) noexcept

diff --git a/externals/coda-oss/modules/c++/str/unittests/test_base_convert.cpp b/externals/coda-oss/modules/c++/str/unittests/test_base_convert.cpp
@@ -25,16 +25,29 @@
 #include <vector>
 #include <string>
 #include <iterator>
-
 #include <std/string>
 
+#include "coda_oss/CPlusPlus.h"
+
 #include <import/str.h>
 #include <str/EncodedString.h>
 #include <str/Encoding.h>
 
 #include "TestCase.h"
 
-static std::string to_string(const coda_oss::u8string& value)
+// It seems that a macro is better than a utility routine, see https://github.com/tahonermann/char8_t-remediation
+// C++20 changed the type of u8 to char8_t* https://en.cppreference.com/w/cpp/language/string_literal
+// Not putting this everywhere because (1) well, it's a macro, and (2) it's mostly
+// only test code that uses string literals.
+#if CODA_OSS_cpp20
+#define U8(ch) u8##ch
+#define U8s(s) u8##s
+#else
+#define U8(ch) static_cast<std::char8_t>(ch)
+#define U8s(s) static_cast<const std::char8_t*>(static_cast<const void*>(s))
+#endif
+
+static std::string to_string(const std::u8string& value)
 {
     return str::c_str<std::string>(value);  // copy
 }
@@ -75,29 +88,24 @@ TEST_CASE(testCharToString)
     TEST_ASSERT_EQ(str::toString<char>(65), "A");
 }
 
-static coda_oss::u8string fromWindows1252(const std::string& s)
+static inline std::u8string fromWindows1252(const std::string& s)
 {
     // s is Windows-1252 on ALL platforms
     return str::fromWindows1252(s.c_str(), s.size());
 }
 
-template<typename T>
-static constexpr std::u8string::value_type cast8(T ch)
-{
-    static_assert(sizeof(std::u8string::value_type) == sizeof(char), "sizeof(Char8_T) != sizeof(char)");
-    return static_cast<std::u8string::value_type>(ch);
-}
-template <typename T>
-static constexpr std::u32string::value_type cast32(T ch)
+template<typename TChar>
+static inline constexpr std::u32string::value_type U(TChar ch)
 {
     return static_cast<std::u32string::value_type>(ch);
 }
+
 TEST_CASE(test_string_to_u8string_ascii)
 {
     {
         const std::string input = "|\x00";  //  ASCII, "|<NULL>"
         const auto actual = fromWindows1252(input);
-        const std::u8string expected{cast8('|')}; // '\x00' is the end of the string in C/C++
+        const std::u8string expected{U8('|')}; // '\x00' is the end of the string in C/C++
         TEST_ASSERT_EQ(actual, expected);
     }
     constexpr uint8_t start_of_heading = 0x01;
@@ -106,9 +114,9 @@ TEST_CASE(test_string_to_u8string_ascii)
     {
         const std::string input { '|', static_cast<std::string::value_type>(ch), '|'};
         const auto actual = fromWindows1252(input);
-        const std::u8string expected8{cast8('|'), cast8(ch), cast8('|')}; 
+        const std::u8string expected8{U8('|'), U8(ch), U8('|')}; 
         TEST_ASSERT_EQ(actual, expected8);
-        const std::u32string expected{cast32('|'), cast32(ch), cast32('|')};
+        const std::u32string expected{U'|', U(ch), U'|'};
         TEST_ASSERT_EQ(to_string(actual), to_string(expected));
     }
 }
@@ -119,17 +127,17 @@ TEST_CASE(test_string_to_u8string_windows_1252)
     {
         const std::string input = "|\x80|";  // Windows-1252, "|€|"
         const auto actual = fromWindows1252(input);
-        const std::u8string expected8{cast8('|'), cast8('\xE2'), cast8('\x82'), cast8('\xAC'), cast8('|')};  // UTF-8,  "|€|"
+        const std::u8string expected8{U8s("|\xE2\x82\xAC|")};  // UTF-8,  "|€|"
         TEST_ASSERT_EQ(actual, expected8);
-        const std::u32string expected{cast32('|'), 0x20AC, cast32('|')};  // UTF-32,  "|€|"
+        const std::u32string expected{U'|', 0x20AC, U'|'};  // UTF-32,  "|€|"
         TEST_ASSERT_EQ(to_string(actual), to_string(expected));
     }
     {
         const std::string input = "|\x9F|";  // Windows-1252, "|Ÿ|"
         const auto actual = fromWindows1252(input);
-        const std::u8string expected8{cast8('|'), cast8('\xC5'), cast8('\xB8'), cast8('|')};  // UTF-8,  "|Ÿ|"
+        const std::u8string expected8{U8s("|\xC5\xB8|")};  // UTF-8,  "|Ÿ|"
         TEST_ASSERT_EQ(actual, expected8);
-        const std::u32string expected{cast32('|'), 0x0178, cast32('|')};  // UTF-32,  "|Ÿ|"
+        const std::u32string expected{U'|', 0x0178, U'|'};  // UTF-32,  "|Ÿ|"
         TEST_ASSERT_EQ(to_string(actual), to_string(expected));
     }
     {
@@ -138,9 +146,9 @@ TEST_CASE(test_string_to_u8string_windows_1252)
         {
             const std::string input{'|', ch, '|'};
             const auto actual = fromWindows1252(input);
-            static const std::u8string expected8{cast8('|'), cast8('\xEF'), cast8('\xBF'), cast8('\xBD'), cast8('|')};  // UTF-8,  "|<REPLACEMENT CHARACTER>|"
+            static const std::u8string expected8{U8s("|\xEF\xBF\xBD|")};  // UTF-8,  "|<REPLACEMENT CHARACTER>|"
             TEST_ASSERT_EQ(actual, expected8);
-            const std::u32string expected{cast32('|'), 0xfffd, cast32('|')};  // UTF-32,  "|<REPLACEMENT CHARACTER>|"
+            const std::u32string expected{U'|', 0xfffd, U'|'};  // UTF-32,  "|<REPLACEMENT CHARACTER>|"
             TEST_ASSERT_EQ(to_string(actual), to_string(expected));
         }    
     }
@@ -208,7 +216,7 @@ TEST_CASE(test_string_to_u8string_iso8859_1)
         const std::string input_ { '|', static_cast<std::string::value_type>(ch), '|'};
         const str::W1252string input(str::c_str<str::W1252string>(input_));
         const auto actual = to_u8string(input);
-        const std::u32string expected{cast32('|'), cast32(ch), cast32('|')};
+        const std::u32string expected{U'|', U(ch), U'|'};
         TEST_ASSERT_EQ(to_string(actual), to_string(expected));
 
         // Can't compare the values with == because TEST_ASSERT_EQ()
@@ -264,7 +272,7 @@ TEST_CASE(test_change_case)
 // https://en.wikipedia.org/wiki/%C3%89#Character_mappings
 static const str::EncodedString& classificationText_utf_8()
 {
-    static const str::EncodedString retval(str::cast<coda_oss::u8string::const_pointer>("A\xc3\x89IOU")); // UTF-8 "AÉIOU"
+    static const str::EncodedString retval(str::cast<std::u8string::const_pointer>("A\xc3\x89IOU")); // UTF-8 "AÉIOU"
     return retval;
  }
 static const str::EncodedString& classificationText_iso8859_1()
-Original file line number
+Diff line change
@@ Expand Up @@
     {
         if (v.mIsUtf8)
         {
-            assign(v.c_str());
+            assign(v.c_u8str());
         }
         else
         {
@@ Expand Down @@