From 1816dfd40901f3b9262ba3f491708b1aee2fad4a Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Sat, 29 Apr 2023 21:17:39 +0300 Subject: [PATCH 1/9] Share UTF8 converters between coreclr and mono - v1 --- src/coreclr/pal/src/CMakeLists.txt | 4 +- src/coreclr/pal/src/locale/unicode.cpp | 93 +- src/coreclr/pal/src/locale/utf8.cpp | 2937 ----------------- .../MultiByteToWideChar/test4/test4.cpp | 2 +- .../WideCharToMultiByte/test5/test5.cpp | 2 +- .../TypeBuilder/TypeBuilderDefineEvent.cs | 2 +- .../TypeBuilder/TypeBuilderDefineProperty.cs | 2 +- src/mono/mono/eglib/CMakeLists.txt | 5 +- src/mono/mono/eglib/glib.h | 1 + src/mono/mono/eglib/gutf8.c | 323 -- .../minipal/utf8converter.c} | 481 ++- src/native/minipal/utf8converter.h | 200 ++ 12 files changed, 683 insertions(+), 3369 deletions(-) delete mode 100644 src/coreclr/pal/src/locale/utf8.cpp delete mode 100644 src/mono/mono/eglib/gutf8.c rename src/{mono/mono/eglib/giconv.c => native/minipal/utf8converter.c} (68%) create mode 100644 src/native/minipal/utf8converter.h diff --git a/src/coreclr/pal/src/CMakeLists.txt b/src/coreclr/pal/src/CMakeLists.txt index bd5a6bdf4d5b22..22d9c29594dd27 100644 --- a/src/coreclr/pal/src/CMakeLists.txt +++ b/src/coreclr/pal/src/CMakeLists.txt @@ -152,7 +152,7 @@ set(SOURCES loader/module.cpp locale/unicode.cpp locale/unicodedata.cpp - locale/utf8.cpp + ${CLR_SRC_NATIVE_DIR}/minipal/utf8converter.c map/common.cpp map/map.cpp map/virtual.cpp @@ -213,6 +213,8 @@ set(SOURCES thread/threadsusp.cpp ) +set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8converter.c" PROPERTIES COMPILE_FLAGS -Wno-implicit-fallthrough) + if(NOT CLR_CMAKE_USE_SYSTEM_LIBUNWIND) set(LIBUNWIND_OBJECTS $) endif(NOT CLR_CMAKE_USE_SYSTEM_LIBUNWIND) diff --git a/src/coreclr/pal/src/locale/unicode.cpp b/src/coreclr/pal/src/locale/unicode.cpp index f29eabc07d9be3..b4c832c3d2e599 100644 --- a/src/coreclr/pal/src/locale/unicode.cpp +++ b/src/coreclr/pal/src/locale/unicode.cpp 
@@ -34,6 +34,7 @@ Revision History: #include #include +#include using namespace CorUnix; @@ -227,7 +228,7 @@ MultiByteToWideChar( OUT LPWSTR lpWideCharStr, IN int cchWideChar) { - INT retval =0; + long retval = 0; PERF_ENTRY(MultiByteToWideChar); ENTRY("MultiByteToWideChar(CodePage=%u, dwFlags=%#x, lpMultiByteStr=%p (%s)," @@ -253,16 +254,51 @@ MultiByteToWideChar( goto EXIT; } - // Use UTF8ToUnicode on all systems, since it replaces + // Use g_utf8_to_utf16_custom_alloc_optional on all systems, since it replaces // invalid characters and Core Foundation doesn't do that. if (CodePage == CP_UTF8 || CodePage == CP_ACP) { + int inputLength = (int)strlen(lpMultiByteStr); + bool allowNulls = (cbMultiByte > 0 && lpMultiByteStr[cbMultiByte - 1] != '\0'); + bool subtractOne = cbMultiByte == cchWideChar || allowNulls; if (cbMultiByte <= -1) { - cbMultiByte = strlen(lpMultiByteStr) + 1; + cbMultiByte = inputLength + 1; } - retval = UTF8ToUnicode(lpMultiByteStr, cbMultiByte, lpWideCharStr, cchWideChar, dwFlags); + size_t allocSize = 0; + struct cookie { LPWSTR str; size_t* allocSize; int* count; }; + cookie callbackCookie = { .str = lpWideCharStr, .allocSize = &allocSize, .count = &cchWideChar }; + + long itemsWritten; + GError *gerror = NULL; + lpWideCharStr = (LPWSTR)g_utf8_to_utf16_custom_alloc_optional(lpMultiByteStr, cbMultiByte, &retval, &itemsWritten, allowNulls, + !(dwFlags & MB_ERR_INVALID_CHARS), cbMultiByte > inputLength, + [](size_t req_size, void* custom_alloc_data) + { + cookie* callbackCookie = (cookie*)(custom_alloc_data); + *(callbackCookie->allocSize) = (req_size / sizeof (gunichar2)); + int count = *(callbackCookie->count); + return (void*)(callbackCookie->str && !(count && *(callbackCookie->allocSize) - 1 > (size_t)count) ? 
callbackCookie->str : NULL); + }, &callbackCookie, &gerror); + + if (gerror && (lpWideCharStr || (cchWideChar && allocSize > (size_t)cchWideChar))) + { + retval = 0; + ERROR ("The error is %d %s\n", gerror->code, gerror->message); + switch (gerror->code) + { + case G_CONVERT_ERROR_ILLEGAL_SEQUENCE: SetLastError(ERROR_NO_UNICODE_TRANSLATION); break; + case G_CONVERT_ERROR_NO_MEMORY: SetLastError(ERROR_INSUFFICIENT_BUFFER); break; + default: SetLastError(ERROR_INVALID_PARAMETER); break; + } + free(gerror); + goto EXIT; + } + + retval = allocSize; + if (retval > 1 && subtractOne) retval -= 1; + goto EXIT; } @@ -274,7 +310,7 @@ MultiByteToWideChar( LOGEXIT("MultiByteToWideChar returns %d.\n",retval); PERF_EXIT(MultiByteToWideChar); - return retval; + return (int)retval; } @@ -297,7 +333,7 @@ WideCharToMultiByte( IN LPCSTR lpDefaultChar, OUT LPBOOL lpUsedDefaultChar) { - INT retval =0; + long retval = 0; char defaultChar = '?'; BOOL usedDefaultChar = FALSE; @@ -338,15 +374,50 @@ WideCharToMultiByte( defaultChar = *lpDefaultChar; } - // Use UnicodeToUTF8 on all systems because we use - // UTF8ToUnicode in MultiByteToWideChar() on all systems. + // Use g_utf16_to_utf8_custom_alloc_with_nulls on all systems because we use + // g_utf8_to_utf16 in MultiByteToWideChar() on all systems. 
if (CodePage == CP_UTF8 || CodePage == CP_ACP) { + int inputLength = (int)PAL_wcslen(lpWideCharStr); + bool allowNulls = (cchWideChar > 0 && lpWideCharStr[cchWideChar - 1] != '\0'); + bool subtractOne = cchWideChar == cbMultiByte || allowNulls; if (cchWideChar == -1) { - cchWideChar = PAL_wcslen(lpWideCharStr) + 1; + cchWideChar = inputLength + 1; } - retval = UnicodeToUTF8(lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte); + + size_t allocSize = 0; + struct cookie { LPSTR str; size_t* allocSize; int* count; }; + cookie callbackCookie = { .str = lpMultiByteStr, .allocSize = &allocSize, .count = &cbMultiByte }; + + long itemsWritten; + GError *gerror = NULL; + lpMultiByteStr = g_utf16_to_utf8_custom_alloc_with_nulls((unsigned short*)lpWideCharStr, cchWideChar, &retval, &itemsWritten, allowNulls, cchWideChar > inputLength, + [](size_t req_size, void* custom_alloc_data) + { + cookie* callbackCookie = (cookie*)(custom_alloc_data); + *(callbackCookie->allocSize) = req_size; + int count = (size_t)*(callbackCookie->count); + return (void*)(callbackCookie->str && !(count && *(callbackCookie->allocSize) - 1 > (size_t)count) ? 
callbackCookie->str : NULL); + }, &callbackCookie, &gerror); + + if (gerror && (lpMultiByteStr || (cbMultiByte && allocSize > (size_t)cbMultiByte))) + { + retval = 0; + ERROR ("The error is %d %s\n", gerror->code, gerror->message); + switch (gerror->code) + { + case G_CONVERT_ERROR_ILLEGAL_SEQUENCE: SetLastError(ERROR_NO_UNICODE_TRANSLATION); break; + case G_CONVERT_ERROR_NO_MEMORY: SetLastError(ERROR_INSUFFICIENT_BUFFER); break; + default: SetLastError(ERROR_INVALID_PARAMETER); break; + } + free(gerror); + goto EXIT; + } + + retval = allocSize; + if (retval > 1 && subtractOne) retval -= 1; + goto EXIT; } @@ -374,7 +445,7 @@ WideCharToMultiByte( LOGEXIT("WideCharToMultiByte returns INT %d\n", retval); PERF_EXIT(WideCharToMultiByte); - return retval; + return (int)retval; } extern char * g_szCoreCLRPath; diff --git a/src/coreclr/pal/src/locale/utf8.cpp b/src/coreclr/pal/src/locale/utf8.cpp deleted file mode 100644 index f07c69ff7e15f3..00000000000000 --- a/src/coreclr/pal/src/locale/utf8.cpp +++ /dev/null @@ -1,2937 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*++ - -Module Name: - - unicode/utf8.c - -Abstract: - Functions to encode and decode UTF-8 strings. This is a port of the C# version from Utf8Encoding.cs. 
- -Revision History: - ---*/ - -#include "pal/utf8.h" -#include "pal/malloc.hpp" - -using namespace CorUnix; - -#define FASTLOOP - -struct CharUnicodeInfo -{ - static const WCHAR HIGH_SURROGATE_START = 0xd800; - static const WCHAR HIGH_SURROGATE_END = 0xdbff; - static const WCHAR LOW_SURROGATE_START = 0xdc00; - static const WCHAR LOW_SURROGATE_END = 0xdfff; -}; - -struct Char -{ - // Test if the wide character is a high surrogate - static bool IsHighSurrogate(const WCHAR c) - { - return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START; - } - - // Test if the wide character is a low surrogate - static bool IsLowSurrogate(const WCHAR c) - { - return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START; - } - - // Test if the wide character is a surrogate half - static bool IsSurrogate(const WCHAR c) - { - return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START; - } - - // Test if the wide character is a high surrogate - static bool IsHighSurrogate(const WCHAR* s, int index) - { - return IsHighSurrogate(s[index]); - } - - // Test if the wide character is a low surrogate - static bool IsLowSurrogate(const WCHAR* s, int index) - { - return IsLowSurrogate(s[index]); - } - - // Test if the wide character is a surrogate half - static bool IsSurrogate(const WCHAR* s, int index) - { - return IsSurrogate(s[index]); - } -}; - -class ArgumentException -{ - -public: - ArgumentException(LPCSTR message) - { - } - - ArgumentException(LPCSTR message, LPCSTR argName) - { - } -}; - -class ArgumentNullException : public ArgumentException -{ -public: - ArgumentNullException(LPCSTR argName) - : ArgumentException("Argument is NULL", argName) - { - - } -}; - -class ArgumentOutOfRangeException : public ArgumentException -{ -public: - ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message) - : ArgumentException(message, argName) - { - - } -}; - -class InsufficientBufferException : public ArgumentException -{ -public: - InsufficientBufferException(LPCSTR message, LPCSTR argName) 
- : ArgumentException(message, argName) - { - - } -}; - -class Contract -{ -public: - static void Assert(bool cond, LPCSTR str) - { - if (!cond) - { - throw ArgumentException(str); - } - } - - static void EndContractBlock() - { - } -}; - -class DecoderFallbackException : public ArgumentException -{ - BYTE *bytesUnknown; - int index; - -public: - DecoderFallbackException( - LPCSTR message, BYTE bytesUnknown[], int index) : ArgumentException(message) - { - this->bytesUnknown = bytesUnknown; - this->index = index; - } - - BYTE *BytesUnknown() - { - return (bytesUnknown); - } - - int GetIndex() - { - return index; - } -}; - -class DecoderFallbackBuffer; - -class DecoderFallback -{ -public: - - // Fallback - // - // Return the appropriate unicode string alternative to the character that need to fall back. - - virtual DecoderFallbackBuffer* CreateFallbackBuffer() = 0; - - // Maximum number of characters that this instance of this fallback could return - - virtual int GetMaxCharCount() = 0; -}; - -class DecoderReplacementFallback : public DecoderFallback -{ - // Our variables - WCHAR strDefault[2]; - int strDefaultLength; - -public: - // Construction. Default replacement fallback uses no best fit and ? replacement string - DecoderReplacementFallback() : DecoderReplacementFallback(W("?")) - { - } - - DecoderReplacementFallback(const WCHAR* replacement) - { - // Must not be null - if (replacement == nullptr) - throw ArgumentNullException("replacement"); - Contract::EndContractBlock(); - - // Make sure it doesn't have bad surrogate pairs - bool bFoundHigh = false; - int replacementLength = PAL_wcslen((const WCHAR *)replacement); - for (int i = 0; i < replacementLength; i++) - { - // Found a surrogate? - if (Char::IsSurrogate(replacement, i)) - { - // High or Low? 
- if (Char::IsHighSurrogate(replacement, i)) - { - // if already had a high one, stop - if (bFoundHigh) - break; // break & throw at the bFoundHIgh below - bFoundHigh = true; - } - else - { - // Low, did we have a high? - if (!bFoundHigh) - { - // Didn't have one, make if fail when we stop - bFoundHigh = true; - break; - } - - // Clear flag - bFoundHigh = false; - } - } - // If last was high we're in trouble (not surrogate so not low surrogate, so break) - else if (bFoundHigh) - break; - } - if (bFoundHigh) - throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement"); - - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); - strDefaultLength = replacementLength; - } - - WCHAR* GetDefaultString() - { - return strDefault; - } - - virtual DecoderFallbackBuffer* CreateFallbackBuffer(); - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return strDefaultLength; - } -}; - -class DecoderFallbackBuffer -{ - friend class UTF8Encoding; - // Most implementations will probably need an implementation-specific constructor - - // internal methods that cannot be overridden that let us do our fallback thing - // These wrap the internal methods so that we can check for people doing stuff that's incorrect - -public: - virtual ~DecoderFallbackBuffer() = default; - - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = 0; - - // Get next character - virtual WCHAR GetNextChar() = 0; - - //Back up a character - virtual bool MovePrevious() = 0; - - // How many chars left in this fallback? - virtual int GetRemaining() = 0; - - // Clear the buffer - virtual void Reset() - { - while (GetNextChar() != (WCHAR)0); - } - - // Internal items to help us figure out what we're doing as far as error messages, etc. 
- // These help us with our performance and messages internally -protected: - BYTE* byteStart; - WCHAR* charEnd; - - // Internal reset - void InternalReset() - { - byteStart = nullptr; - Reset(); - } - - // Set the above values - // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. - void InternalInitialize(BYTE* byteStart, WCHAR* charEnd) - { - this->byteStart = byteStart; - this->charEnd = charEnd; - } - - // Fallback the current byte by sticking it into the remaining char buffer. - // This can only be called by our encodings (other have to use the public fallback methods), so - // we can use our DecoderNLS here too (except we don't). - // Returns true if we are successful, false if we can't fallback the character (no buffer space) - // So caller needs to throw buffer space if return false. - // Right now this has both bytes and bytes[], since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - // Don't touch ref chars unless we succeed - virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size) - { - - Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize"); - - // See if there's a fallback character and we have an output buffer then copy our string. 
- if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) - { - // Copy the chars to our output - WCHAR ch; - WCHAR* charTemp = *chars; - bool bHighSurrogate = false; - while ((ch = GetNextChar()) != 0) - { - // Make sure no mixed up surrogates - if (Char::IsSurrogate(ch)) - { - if (Char::IsHighSurrogate(ch)) - { - // High Surrogate - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - bHighSurrogate = true; - } - else - { - // Low surrogate - if (!bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - bHighSurrogate = false; - } - } - - if (charTemp >= charEnd) - { - // No buffer space - return false; - } - - *(charTemp++) = ch; - } - - // Need to make sure that bHighSurrogate isn't true - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - - // Now we aren't going to be false, so its OK to update chars - *chars = charTemp; - } - - return true; - } - - // This version just counts the fallback and doesn't actually copy anything. - virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) - // Right now this has both bytes[] and BYTE* bytes, since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - { - - Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize"); - - // See if there's a fallback character and we have an output buffer then copy our string. 
- if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) - { - int count = 0; - - WCHAR ch; - bool bHighSurrogate = false; - while ((ch = GetNextChar()) != 0) - { - // Make sure no mixed up surrogates - if (Char::IsSurrogate(ch)) - { - if (Char::IsHighSurrogate(ch)) - { - // High Surrogate - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - bHighSurrogate = true; - } - else - { - // Low surrogate - if (!bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - bHighSurrogate = false; - } - } - - count++; - } - - // Need to make sure that bHighSurrogate isn't true - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - - return count; - } - - // If no fallback return 0 - return 0; - } - - // private helper methods - void ThrowLastBytesRecursive(BYTE bytesUnknown[]) - { - throw ArgumentException("Recursive fallback not allowed"); - } -}; - -class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer -{ - // Store our default string - WCHAR strDefault[2]; - int strDefaultLength; - int fallbackCount = -1; - int fallbackIndex = -1; - -public: - // Construction - DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback) - { - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); - } - - // Fallback Methods - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) - { - // We expect no previous fallback in our buffer - // We can't call recursively but others might (note, we don't test on last char!!!) 
- if (fallbackCount >= 1) - { - ThrowLastBytesRecursive(bytesUnknown); - } - - // Go ahead and get our fallback - if (strDefaultLength == 0) - return false; - - fallbackCount = strDefaultLength; - fallbackIndex = -1; - - return true; - } - - virtual WCHAR GetNextChar() - { - // We want it to get < 0 because == 0 means that the current/last character is a fallback - // and we need to detect recursion. We could have a flag but we already have this counter. - fallbackCount--; - fallbackIndex++; - - // Do we have anything left? 0 is now last fallback char, negative is nothing left - if (fallbackCount < 0) - return '\0'; - - // Need to get it out of the buffer. - // Make sure it didn't wrap from the fast count-- path - if (fallbackCount == INT_MAX) - { - fallbackCount = -1; - return '\0'; - } - - // Now make sure its in the expected range - Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, - "Index exceeds buffer range"); - - return strDefault[fallbackIndex]; - } - - virtual bool MovePrevious() - { - // Back up one, only if we just processed the last character (or earlier) - if (fallbackCount >= -1 && fallbackIndex >= 0) - { - fallbackIndex--; - fallbackCount++; - return true; - } - - // Return false 'cause we couldn't do it. - return false; - } - - // How many characters left to output? - virtual int GetRemaining() - { - // Our count is 0 for 1 character left. - return (fallbackCount < 0) ? 0 : fallbackCount; - } - - // Clear the buffer - virtual void Reset() - { - fallbackCount = -1; - fallbackIndex = -1; - byteStart = nullptr; - } - - // This version just counts the fallback and doesn't actually copy anything. 
- virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) - // Right now this has both bytes and bytes[], since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - { - // return our replacement string Length - return strDefaultLength; - } -}; - -class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer -{ -public: - DecoderExceptionFallbackBuffer() - { - } - - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) - { - throw DecoderFallbackException( - "Unable to translate UTF-8 character to Unicode", bytesUnknown, index); - } - - virtual WCHAR GetNextChar() - { - return 0; - } - - virtual bool MovePrevious() - { - // Exception fallback doesn't have anywhere to back up to. - return false; - } - - // Exceptions are always empty - virtual int GetRemaining() - { - return 0; - } - -}; - -class DecoderExceptionFallback : public DecoderFallback -{ - // Construction -public: - DecoderExceptionFallback() - { - } - - virtual DecoderFallbackBuffer* CreateFallbackBuffer() - { - return InternalNew(); - } - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return 0; - } -}; - -DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer() -{ - return InternalNew(this); -} - -class EncoderFallbackException : public ArgumentException -{ - WCHAR charUnknown; - WCHAR charUnknownHigh; - WCHAR charUnknownLow; - int index; - -public: - EncoderFallbackException( - LPCSTR message, WCHAR charUnknown, int index) : ArgumentException(message) - { - this->charUnknown = charUnknown; - this->index = index; - } - - EncoderFallbackException( - LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException(message) - { - if (!Char::IsHighSurrogate(charUnknownHigh)) - { - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - } - if (!Char::IsLowSurrogate(charUnknownLow)) 
- { - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - } - Contract::EndContractBlock(); - - this->charUnknownHigh = charUnknownHigh; - this->charUnknownLow = charUnknownLow; - this->index = index; - } - - WCHAR GetCharUnknown() - { - return (charUnknown); - } - - WCHAR GetCharUnknownHigh() - { - return (charUnknownHigh); - } - - WCHAR GetCharUnknownLow() - { - return (charUnknownLow); - } - - int GetIndex() - { - return index; - } - - // Return true if the unknown character is a surrogate pair. - bool IsUnknownSurrogate() - { - return (charUnknownHigh != '\0'); - } -}; - -class EncoderFallbackBuffer; - -class EncoderFallback -{ -public: - - // Fallback - // - // Return the appropriate unicode string alternative to the character that need to fall back. - - virtual EncoderFallbackBuffer* CreateFallbackBuffer() = 0; - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() = 0; -}; - -class EncoderReplacementFallback : public EncoderFallback -{ - // Our variables - WCHAR strDefault[2]; - int strDefaultLength; - -public: - // Construction. Default replacement fallback uses no best fit and ? replacement string - EncoderReplacementFallback() : EncoderReplacementFallback(W("?")) - { - } - - EncoderReplacementFallback(const WCHAR* replacement) - { - // Must not be null - if (replacement == nullptr) - throw ArgumentNullException("replacement"); - Contract::EndContractBlock(); - - // Make sure it doesn't have bad surrogate pairs - bool bFoundHigh = false; - int replacementLength = PAL_wcslen((const WCHAR *)replacement); - for (int i = 0; i < replacementLength; i++) - { - // Found a surrogate? - if (Char::IsSurrogate(replacement, i)) - { - // High or Low? 
- if (Char::IsHighSurrogate(replacement, i)) - { - // if already had a high one, stop - if (bFoundHigh) - break; // break & throw at the bFoundHIgh below - bFoundHigh = true; - } - else - { - // Low, did we have a high? - if (!bFoundHigh) - { - // Didn't have one, make if fail when we stop - bFoundHigh = true; - break; - } - - // Clear flag - bFoundHigh = false; - } - } - // If last was high we're in trouble (not surrogate so not low surrogate, so break) - else if (bFoundHigh) - break; - } - if (bFoundHigh) - throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement"); - - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); - strDefaultLength = replacementLength; - } - - WCHAR* GetDefaultString() - { - return strDefault; - } - - virtual EncoderFallbackBuffer* CreateFallbackBuffer(); - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return strDefaultLength; - } -}; - -class EncoderFallbackBuffer -{ - friend class UTF8Encoding; - // Most implementations will probably need an implementation-specific constructor - - // Public methods that cannot be overridden that let us do our fallback thing - // These wrap the internal methods so that we can check for people doing stuff that is incorrect - -public: - virtual ~EncoderFallbackBuffer() = default; - - virtual bool Fallback(WCHAR charUnknown, int index) = 0; - - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = 0; - - // Get next character - virtual WCHAR GetNextChar() = 0; - - // Back up a character - virtual bool MovePrevious() = 0; - - // How many chars left in this fallback? - virtual int GetRemaining() = 0; - - // Not sure if this should be public or not. - // Clear the buffer - virtual void Reset() - { - while (GetNextChar() != (WCHAR)0); - } - - // Internal items to help us figure out what we're doing as far as error messages, etc. 
- // These help us with our performance and messages internally -protected: - WCHAR* charStart; - WCHAR* charEnd; - bool setEncoder; - bool bUsedEncoder; - bool bFallingBack = false; - int iRecursionCount = 0; - static const int iMaxRecursion = 250; - - // Internal Reset - // For example, what if someone fails a conversion and wants to reset one of our fallback buffers? - void InternalReset() - { - charStart = nullptr; - bFallingBack = false; - iRecursionCount = 0; - Reset(); - } - - // Set the above values - // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. - void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder) - { - this->charStart = charStart; - this->charEnd = charEnd; - this->setEncoder = setEncoder; - this->bUsedEncoder = false; - this->bFallingBack = false; - this->iRecursionCount = 0; - } - - WCHAR InternalGetNextChar() - { - WCHAR ch = GetNextChar(); - bFallingBack = (ch != 0); - if (ch == 0) iRecursionCount = 0; - return ch; - } - - // Fallback the current character using the remaining buffer and encoder if necessary - // This can only be called by our encodings (other have to use the public fallback methods), so - // we can use our EncoderNLS here too. - // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount - // - // Note that this could also change the contents of this->encoder, which is the same - // object that the caller is using, so the caller could mess up the encoder for us - // if they aren't careful. 
- virtual bool InternalFallback(WCHAR ch, WCHAR** chars) - { - // Shouldn't have null charStart - Contract::Assert(charStart != nullptr, - "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized"); - - // Get our index, remember chars was preincremented to point at next char, so have to -1 - int index = (int)(*chars - charStart) - 1; - - // See if it was a high surrogate - if (Char::IsHighSurrogate(ch)) - { - // See if there's a low surrogate to go with it - if (*chars >= this->charEnd) - { - // Nothing left in input buffer - // No input, return 0 - } - else - { - // Might have a low surrogate - WCHAR cNext = **chars; - if (Char::IsLowSurrogate(cNext)) - { - // If already falling back then fail - if (bFallingBack && iRecursionCount++ > iMaxRecursion) - ThrowLastCharRecursive(ch, cNext); - - // Next is a surrogate, add it as surrogate pair, and increment chars - (*chars)++; - bFallingBack = Fallback(ch, cNext, index); - return bFallingBack; - } - - // Next isn't a low surrogate, just fallback the high surrogate - } - } - - // If already falling back then fail - if (bFallingBack && iRecursionCount++ > iMaxRecursion) - ThrowLastCharRecursive((int)ch); - - // Fall back our char - bFallingBack = Fallback(ch, index); - - return bFallingBack; - } - - // private helper methods - void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate) - { - // Throw it, using our complete character - throw ArgumentException("Recursive fallback not allowed", "chars"); - } - - void ThrowLastCharRecursive(int utf32Char) - { - throw ArgumentException("Recursive fallback not allowed", "chars"); - } - -}; - -class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer -{ - // Store our default string - WCHAR strDefault[4]; - int strDefaultLength; - int fallbackCount = -1; - int fallbackIndex = -1; -public: - // Construction - EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback) - { - // 2X in case we're a surrogate pair - 
wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - wcscat_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); - - } - - // Fallback Methods - virtual bool Fallback(WCHAR charUnknown, int index) - { - // If we had a buffer already we're being recursive, throw, it's probably at the suspect - // character in our array. - if (fallbackCount >= 1) - { - // If we're recursive we may still have something in our buffer that makes this a surrogate - if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= 0 && - Char::IsLowSurrogate(strDefault[fallbackIndex + 1])) - ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + 1]); - - // Nope, just one character - ThrowLastCharRecursive((int)charUnknown); - } - - // Go ahead and get our fallback - // Divide by 2 because we aren't a surrogate pair - fallbackCount = strDefaultLength / 2; - fallbackIndex = -1; - - return fallbackCount != 0; - } - - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) - { - // Double check input surrogate pair - if (!Char::IsHighSurrogate(charUnknownHigh)) - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - - if (!Char::IsLowSurrogate(charUnknownLow)) - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - Contract::EndContractBlock(); - - // If we had a buffer already we're being recursive, throw, it's probably at the suspect - // character in our array. - if (fallbackCount >= 1) - ThrowLastCharRecursive(charUnknownHigh, charUnknownLow); - - // Go ahead and get our fallback - fallbackCount = strDefaultLength; - fallbackIndex = -1; - - return fallbackCount != 0; - } - - virtual WCHAR GetNextChar() - { - // We want it to get < 0 because == 0 means that the current/last character is a fallback - // and we need to detect recursion. 
We could have a flag but we already have this counter. - fallbackCount--; - fallbackIndex++; - - // Do we have anything left? 0 is now last fallback char, negative is nothing left - if (fallbackCount < 0) - return '\0'; - - // Need to get it out of the buffer. - // Make sure it didn't wrap from the fast count-- path - if (fallbackCount == INT_MAX) - { - fallbackCount = -1; - return '\0'; - } - - // Now make sure its in the expected range - Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, - "Index exceeds buffer range"); - - return strDefault[fallbackIndex]; - } - - virtual bool MovePrevious() - { - // Back up one, only if we just processed the last character (or earlier) - if (fallbackCount >= -1 && fallbackIndex >= 0) - { - fallbackIndex--; - fallbackCount++; - return true; - } - - // Return false 'cause we couldn't do it. - return false; - } - - // How many characters left to output? - virtual int GetRemaining() - { - // Our count is 0 for 1 character left. - return (fallbackCount < 0) ? 
0 : fallbackCount; - } - - // Clear the buffer - virtual void Reset() - { - fallbackCount = -1; - fallbackIndex = 0; - charStart = nullptr; - bFallingBack = false; - } -}; - -class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer -{ -public: - EncoderExceptionFallbackBuffer() - { - } - - virtual bool Fallback(WCHAR charUnknown, int index) - { - // Fall back our char - throw EncoderFallbackException("Unable to translate Unicode character to UTF-8", charUnknown, index); - } - - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) - { - if (!Char::IsHighSurrogate(charUnknownHigh)) - { - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - } - if (!Char::IsLowSurrogate(charUnknownLow)) - { - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - } - Contract::EndContractBlock(); - - //int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow); - - // Fall back our char - throw EncoderFallbackException( - "Unable to translate Unicode character to UTF-8", charUnknownHigh, charUnknownLow, index); - } - - virtual WCHAR GetNextChar() - { - return 0; - } - - virtual bool MovePrevious() - { - // Exception fallback doesn't have anywhere to back up to. - return false; - } - - // Exceptions are always empty - virtual int GetRemaining() - { - return 0; - } -}; - -class EncoderExceptionFallback : public EncoderFallback -{ - // Construction -public: - EncoderExceptionFallback() - { - } - - virtual EncoderFallbackBuffer* CreateFallbackBuffer() - { - return InternalNew(); - } - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return 0; - } -}; - -EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer() -{ - return InternalNew(this); -} - -class UTF8Encoding -{ - EncoderFallback* encoderFallback; - // Instances of the two possible fallbacks. 
The constructor parameter - // determines which one to use. - EncoderReplacementFallback encoderReplacementFallback; - EncoderExceptionFallback encoderExceptionFallback; - - DecoderFallback* decoderFallback; - // Instances of the two possible fallbacks. The constructor parameter - // determines which one to use. - DecoderReplacementFallback decoderReplacementFallback; - DecoderExceptionFallback decoderExceptionFallback; - - bool InRange(int c, int begin, int end) - { - return begin <= c && c <= end; - } - - size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2) - { - return ptr1 - ptr2; - } - - size_t PtrDiff(BYTE* ptr1, BYTE* ptr2) - { - return ptr1 - ptr2; - } - - void ThrowBytesOverflow() - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount - throw InsufficientBufferException("The output byte buffer is too small to contain the encoded data", "bytes"); - } - - void ThrowBytesOverflow(bool nothingEncoded) - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount - if (nothingEncoded){ - ThrowBytesOverflow(); - } - } - - void ThrowCharsOverflow() - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented a decoder fallback with a broken GetMaxCharCount - throw InsufficientBufferException("The output char buffer is too small to contain the encoded data", "chars"); - } - - void ThrowCharsOverflow(bool nothingEncoded) - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an decoder fallback with a broken GetMaxCharCount - if (nothingEncoded){ - ThrowCharsOverflow(); - } - } - - // During GetChars we had an invalid byte sequence - // pSrc is backed up to the start of 
the bad sequence if we didn't have room to - // fall it back. Otherwise pSrc remains where it is. - bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget) - { - // Get our byte[] - BYTE* pStart = *pSrc; - BYTE bytesUnknown[3]; - int size = GetBytesUnknown(pStart, ch, bytesUnknown); - - // Do the actual fallback - if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size)) - { - // Oops, it failed, back up to pStart - *pSrc = pStart; - return false; - } - - // It worked - return true; - } - - int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback) - { - // Get our byte[] - BYTE bytesUnknown[3]; - int size = GetBytesUnknown(pSrc, ch, bytesUnknown); - - // Do the actual fallback - int count = fallback->InternalFallback(bytesUnknown, pSrc, size); - - // # of fallback chars expected. - // Note that we only get here for "long" sequences, and have already unreserved - // the count that we prereserved for the input bytes - return count; - } - - int GetBytesUnknown(BYTE* pSrc, int ch, BYTE* bytesUnknown) - { - int size; - - // See if it was a plain char - // (have to check >= 0 because we have all sorts of weird bit flags) - if (ch < 0x100 && ch >= 0) - { - pSrc--; - bytesUnknown[0] = (BYTE)ch; - size = 1; - } - // See if its an unfinished 2 byte sequence - else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0) - { - pSrc--; - bytesUnknown[0] = (BYTE)((ch & 0x1F) | 0xc0); - size = 1; - } - // So now we're either 2nd byte of 3 or 4 byte sequence or - // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence - // 1st check if its a 4 byte sequence - else if ((ch & SupplimentarySeq) != 0) - { - // 3rd byte of 4 byte sequence? 
- if ((ch & (FinalByte >> 6)) != 0) - { - // 3rd byte of 4 byte sequence - pSrc -= 3; - bytesUnknown[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0); - bytesUnknown[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80); - bytesUnknown[2] = (BYTE)(((ch)& 0x3F) | 0x80); - size = 3; - } - else if ((ch & (FinalByte >> 12)) != 0) - { - // 2nd byte of a 4 byte sequence - pSrc -= 2; - bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0); - bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80); - size = 2; - } - else - { - // 4th byte of a 4 byte sequence - pSrc--; - bytesUnknown[0] = (BYTE)(((ch)& 0x07) | 0xF0); - size = 1; - } - } - else - { - // 2nd byte of 3 byte sequence? - if ((ch & (FinalByte >> 6)) != 0) - { - // So its 2nd byte of a 3 byte sequence - pSrc -= 2; - bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0); - bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80); - size = 2; - } - else - { - // 1st byte of a 3 byte sequence - pSrc--; - bytesUnknown[0] = (BYTE)(((ch)& 0x0F) | 0xE0); - size = 1; - } - } - - return size; - } - -public: - - UTF8Encoding(bool isThrowException) - : encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD")) - { - if (isThrowException) - { - encoderFallback = &encoderExceptionFallback; - decoderFallback = &decoderExceptionFallback; - } - else - { - encoderFallback = &encoderReplacementFallback; - decoderFallback = &decoderReplacementFallback; - } - } - - // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits - // while the actual character is being built in the lower bits. They are shifted together - // with the actual bits of the character. 
- - // bits 30 & 31 are used for pending bits fixup - const int FinalByte = 1 << 29; - const int SupplimentarySeq = 1 << 28; - const int ThreeByteSeq = 1 << 27; - - int GetCharCount(BYTE* bytes, int count) - { - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr"); - Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0"); - - // Initialize stuff - BYTE *pSrc = bytes; - BYTE *pEnd = pSrc + count; - - // Start by assuming we have as many as count, charCount always includes the adjustment - // for the character being decoded - int charCount = count; - int ch = 0; - DecoderFallbackBuffer *fallback = nullptr; - - while (true) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) { - break; - } - - // read next byte. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - - if (ch == 0) { - // no pending bits - goto ReadChar; - } - - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & 0xC0) != 0x80) { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - charCount += (ch >> 30); - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) { - Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); - - if ((ch & SupplimentarySeq) != 0) { - if ((ch & (FinalByte >> 6)) != 0) { - // this is 3rd byte (of 4 byte supplimentary) - nothing to do - continue; - } - - // 2nd byte, check for non-shortest form of supplimentary char and the valid - // supplimentary characters in range 0x010000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) { - goto InvalidByteSequence; - } - } - else { - // Must be 
2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // adjust for surrogates in non-shortest form - if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) { - charCount--; - } - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - fallback->InternalInitialize(bytes, nullptr); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); - - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) { - // If its > 0x7F, its start of a new multi-byte sequence - - // Long sequence, so unreserve our char. - charCount--; - - // bit 6 has to be non-zero for start of multibyte chars. - if ((ch & 0x40) == 0) { - // Unexpected trail byte - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) { - if ((ch & 0x10) != 0) { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) { - ch |= 0xf0; - goto InvalidByteSequence; - } - - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. 
- ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now - (1 << 30) | // If it dies on next byte we'll need an extra char - (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - - // Our character count will be 2 characters for these 4 bytes, so subtract another char - charCount--; - } - else { - // 3 byte encoding - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - - // We'll expect 1 character for these 3 bytes, so subtract another char. - charCount--; - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - ch |= 0xc0; - goto InvalidByteSequence; - } - - // Add bit flags so we'll be flagged correctly - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - -#ifdef FASTLOOP - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - if (availableBytes <= 13) { - // try to get over the remainder of the ascii characters fast though - BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - } - // we are done - ch = 0; - break; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - BYTE *pStop = pSrc + availableBytes - 7; - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - - // get pSrc 2-byte aligned - if (((size_t)pSrc & 0x1) != 0) { - ch = *pSrc; - pSrc++; - 
if (ch > 0x7F) { - goto LongCode; - } - } - - // get pSrc 4-byte aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *(USHORT*)pSrc; - if ((ch & 0x8080) != 0) { - goto LongCodeWithMask16; - } - pSrc += 2; - } - - - // Run 8 + 8 characters at a time! - while (pSrc < pStop) { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - pSrc += 8; - - // This is a really small loop - unroll it - if (pSrc >= pStop) - break; - - ch = *(int*)pSrc; - chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - pSrc += 8; - } - break; - -#if BIGENDIAN - LongCodeWithMask32 : - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - LongCodeWithMask16: - ch = (int)(((uint)ch) >> 8); -#else // BIGENDIAN - LongCodeWithMask32: - LongCodeWithMask16: - ch &= 0xFF; -#endif // BIGENDIAN - pSrc++; - if (ch <= 0x7F) { - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) { - - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & 0xC0) != 0x80) { - goto BadLongCode; - } - pSrc += 2; - - // extra byte - charCount--; - } - else { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F 
<< 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - // extra byte - charCount--; - } - } - else { - // 2 byte encoding - - // check for non-shortest form - if ((ch & 0x1E) == 0) { - goto BadLongCode; - } - } - - // extra byte - charCount--; - } -#endif // FASTLOOP - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } - - // May have a problem if we have to flush - if (ch != 0) - { - // We were already adjusting for these, so need to unadjust - charCount += (ch >> 30); - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - fallback->InternalInitialize(bytes, nullptr); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); - } - - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check m_throwOnOverflow for count) - Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, - "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end"); - - InternalDelete(fallback); - - return charCount; - - } - - int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount) - { - Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr"); - Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0"); - Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0"); - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr"); - - BYTE *pSrc = bytes; - WCHAR *pTarget = chars; - - BYTE *pEnd = pSrc + byteCount; - WCHAR *pAllocatedBufferEnd = pTarget + charCount; - - int ch = 0; - - DecoderFallbackBuffer *fallback = nullptr; - - while (true) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) { - break; - } - - 
// read next byte. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - - if (ch == 0) { - // no pending bits - goto ReadChar; - } - - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & 0xC0) != 0x80) { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) { - // Not at last byte yet - Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); - - if ((ch & SupplimentarySeq) != 0) { - // Its a 4-byte supplimentary sequence - if ((ch & (FinalByte >> 6)) != 0) { - // this is 3rd byte of 4 byte sequence - nothing to do - continue; - } - - // 2nd byte of 4 bytes - // check for non-shortest form of surrogate and the valid surrogate - // range 0x000000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) { - goto InvalidByteSequence; - } - } - else { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // surrogate in shortest form? - // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? 
- if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) { - // let the range check for the second char throw the exception - if (pTarget < pAllocatedBufferEnd) { - *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + - (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)))); - pTarget++; - - ch = (ch & 0x3FF) + - (int)(CharUnicodeInfo::LOW_SURROGATE_START); - } - } - - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - fallback->InternalInitialize(bytes, pAllocatedBufferEnd); - } - - // That'll back us up the appropriate # of bytes if we didn't get anywhere - if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget)) - { - // Ran out of buffer space - // Need to throw an exception? - Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback"); - fallback->InternalReset(); - ThrowCharsOverflow(pTarget == chars); - ch = 0; - break; - } - Contract::Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array"); - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) { - // If its > 0x7F, its start of a new multi-byte sequence - - // bit 6 has to be non-zero - if ((ch & 0x40) == 0) { - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) { - if ((ch & 0x10) != 0) { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) { - ch |= 0xf0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | 
(SupplimentarySeq >> 3 * 6); - } - else { - // 3 byte encoding - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - ch |= 0xc0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - // write the pending character - if (pTarget >= pAllocatedBufferEnd) - { - // Fix chars so we make sure to throw if we didn't output anything - ch &= 0x1fffff; - if (ch > 0x7f) - { - if (ch > 0x7ff) - { - if (ch >= CharUnicodeInfo::LOW_SURROGATE_START && - ch <= CharUnicodeInfo::LOW_SURROGATE_END) - { - pSrc--; // It was 4 bytes - pTarget--; // 1 was stored already, but we can't remember 1/2, so back up - } - else if (ch > 0xffff) - { - pSrc--; // It was 4 bytes, nothing was stored - } - pSrc--; // It was at least 3 bytes - } - pSrc--; // It was at least 2 bytes - } - pSrc--; - - // Throw that we don't have enough room (pSrc could be < chars if we had started to process - // a 4 byte sequence already) - Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]"); - ThrowCharsOverflow(pTarget == chars); - - // Don't store ch in decoder, we already backed up to its start - ch = 0; - - // Didn't throw, just use this buffer size. - break; - } - *pTarget = (WCHAR)ch; - pTarget++; - -#ifdef FASTLOOP - int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget); - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - // Test for availableChars is done because pStop would be <= pTarget. - if (availableBytes <= 13) { - // we may need as many as 1 character per byte - if (availableChars < availableBytes) { - // not enough output room. 
no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (WCHAR)ch; - pTarget++; - } - // we are done - ch = 0; - break; - } - - // we may need as many as 1 character per byte, so reduce the byte count if necessary. - // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. - if (availableChars < availableBytes) { - availableBytes = availableChars; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - WCHAR *pStop = pTarget + availableBytes - 7; - - while (pTarget < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (WCHAR)ch; - pTarget++; - - // get pSrc to be 2-byte aligned - if ((((size_t)pSrc) & 0x1) != 0) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (WCHAR)ch; - pTarget++; - } - - // get pSrc to be 4-byte aligned - if ((((size_t)pSrc) & 0x2) != 0) { - ch = *(USHORT*)pSrc; - if ((ch & 0x8080) != 0) { - goto LongCodeWithMask16; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - *pTarget = (WCHAR)((ch >> 8) & 0x7F); - pSrc += 2; - *(pTarget + 1) = (WCHAR)(ch & 0x7F); - pTarget += 2; -#else // BIGENDIAN - *pTarget = (WCHAR)(ch & 0x7F); - pSrc += 2; - *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); - pTarget += 2; -#endif // BIGENDIAN - } - - // Run 8 characters at a time! 
- while (pTarget < pStop) { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - *pTarget = (WCHAR)((ch >> 24) & 0x7F); - *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F); - *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F); - *(pTarget + 3) = (WCHAR)(ch & 0x7F); - pSrc += 8; - *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F); - *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F); - *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F); - *(pTarget + 7) = (WCHAR)(chb & 0x7F); - pTarget += 8; -#else // BIGENDIAN - *pTarget = (WCHAR)(ch & 0x7F); - *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); - *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F); - *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F); - pSrc += 8; - *(pTarget + 4) = (WCHAR)(chb & 0x7F); - *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F); - *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F); - *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F); - pTarget += 8; -#endif // BIGENDIAN - } - break; - -#if BIGENDIAN - LongCodeWithMask32 : - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - LongCodeWithMask16: - ch = (int)(((uint)ch) >> 8); -#else // BIGENDIAN - LongCodeWithMask32: - LongCodeWithMask16: - ch &= 0xFF; -#endif // BIGENDIAN - pSrc++; - if (ch <= 0x7F) { - *pTarget = (WCHAR)ch; - pTarget++; - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) { - - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || 
- // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & 0xC0) != 0x80) { - goto BadLongCode; - } - pSrc += 2; - - ch = (chc << 6) | (ch & 0x3F); - - *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + - (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))); - pTarget++; - - ch = (ch & 0x3FF) + - (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START); - - // extra byte, we're already planning 2 chars for 2 of these bytes, - // but the big loop is testing the target against pStop, so we need - // to subtract 2 more or we risk overrunning the input. Subtract - // one here and one below. - pStop--; - } - else { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - ch = (chc << 6) | (ch & 0x3F); - - // extra byte, we're only expecting 1 char for each of these 3 bytes, - // but the loop is testing the target (not source) against pStop, so - // we need to subtract 2 more or we risk overrunning the input. - // Subtract 1 here and one more below - pStop--; - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - goto BadLongCode; - } - ch = (ch << 6) | chc; - } - - *pTarget = (WCHAR)ch; - pTarget++; - - // extra byte, we're only expecting 1 char for each of these 2 bytes, - // but the loop is testing the target (not source) against pStop. - // subtract an extra count from pStop so that we don't overrun the input. 
- pStop--; - } -#endif // FASTLOOP - - Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd"); - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } - - if (ch != 0) - { - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - fallback->InternalInitialize(bytes, pAllocatedBufferEnd); - } - - // This'll back us up the appropriate # of bytes if we didn't get anywhere - if (!FallbackInvalidByteSequence(pSrc, ch, fallback)) - { - Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing"); - - // Ran out of buffer space - // Need to throw an exception? - fallback->InternalReset(); - ThrowCharsOverflow(pTarget == chars); - } - Contract::Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array"); - ch = 0; - } - - // Shouldn't have anything in fallback buffer for GetChars - // (don't have to check m_throwOnOverflow for chars) - Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, - "[UTF8Encoding.GetChars]Expected empty fallback buffer at end"); - - InternalDelete(fallback); - - return PtrDiff(pTarget, chars); - } - - int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount) - { - Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr"); - Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0"); - Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0"); - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr"); - - // For fallback we may need a fallback buffer. 
- // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer* fallbackBuffer = nullptr; - WCHAR *pSrc = chars; - BYTE *pTarget = bytes; - - WCHAR *pEnd = pSrc + charCount; - BYTE *pAllocatedBufferEnd = pTarget + byteCount; - - int ch = 0; - - // assume that JIT will enregister pSrc, pTarget and ch - - while (true) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) { - - if (ch == 0) { - // Check if there's anything left to get out of the fallback buffer - ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; - if (ch > 0) { - goto ProcessChar; - } - } - else { - // Case of leftover surrogates in the fallback buffer - if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate"); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - int cha = ch; - - ch = fallbackBuffer->InternalGetNextChar(); - - if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - goto EncodeChar; - } - else if (ch > 0){ - goto ProcessChar; - } - else { - break; - } - } - } - - // attempt to encode the partial surrogate (will fail or ignore) - if (ch > 0) - goto EncodeChar; - - // We're done - break; - } - - if (ch > 0) { - // We have a high surrogate left over from a previous loop. 
- Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate");//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = cha + (ch << 10) + - (0x10000 - - CharUnicodeInfo::LOW_SURROGATE_START - - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - - pSrc++; - } - // else ch is still high surrogate and encoding will fail - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackBuffer != nullptr) - { - ch = fallbackBuffer->InternalGetNextChar(); - if (ch > 0) goto ProcessChar; - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed, we have to do fallback for them - // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == nullptr) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - fallbackBuffer = encoderFallback->CreateFallbackBuffer(); - - // Set our internal fallback interesting things. - fallbackBuffer->InternalInitialize(chars, pEnd, true); - } - - // Do our fallback. 
Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. - fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); - - // Ignore it if we don't throw - ch = 0; - continue; - } - - // Count bytes needed - int bytesNeeded = 1; - if (ch > 0x7F) { - if (ch > 0x7FF) { - if (ch > 0xFFFF) { - bytesNeeded++; // 4 bytes (surrogate pair) - } - bytesNeeded++; // 3 bytes (800-FFFF) - } - bytesNeeded++; // 2 bytes (80-7FF) - } - - if (pTarget > pAllocatedBufferEnd - bytesNeeded) { - // Left over surrogate from last time will cause pSrc == chars, so we'll throw - if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) - { - fallbackBuffer->MovePrevious(); // Didn't use this fallback char - if (ch > 0xFFFF) - fallbackBuffer->MovePrevious(); // Was surrogate, didn't use 2nd part either - } - else - { - pSrc--; // Didn't use this char - if (ch > 0xFFFF) - pSrc--; // Was surrogate, didn't use 2nd part either - } - Contract::Assert(pSrc >= chars || pTarget == bytes, - "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room."); - ThrowBytesOverflow(pTarget == bytes); // Throw if we must - ch = 0; // Nothing left over (we backed up to start of pair if supplimentary) - break; - } - - if (ch <= 0x7F) { - *pTarget = (BYTE)ch; - } - else { - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int chb; - if (ch <= 0x7FF) { - // 2 BYTE encoding - chb = (BYTE)(0xC0 | (ch >> 6)); - } - else - { - if (ch <= 0xFFFF) { - chb = (BYTE)(0xE0 | (ch >> 12)); - } - else - { - *pTarget = (BYTE)(0xF0 | (ch >> 18)); - pTarget++; - - chb = 0x80 | ((ch >> 12) & 0x3F); - } - *pTarget = (BYTE)chb; - pTarget++; - - chb = 0x80 | ((ch >> 6) & 0x3F); - } - *pTarget = (BYTE)chb; - pTarget++; - - *pTarget = (BYTE)0x80 | (ch & 0x3F); - } - pTarget++; - - -#ifdef FASTLOOP - // If still have fallback don't do fast loop - if (fallbackBuffer != nullptr && 
(ch = fallbackBuffer->InternalGetNextChar()) != 0) - goto ProcessChar; - - int availableChars = PtrDiff(pEnd, pSrc); - int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget); - - // don't fall into the fast decoding loop if we don't have enough characters - // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. - if (availableChars <= 13) { - // we are hoping for 1 BYTE per char - if (availableBytes < availableChars) { - // not enough output room. no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - // Not ASCII, need more than 1 BYTE per char - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (BYTE)ch; - pTarget++; - } - // we are done, let ch be 0 to clear encoder - ch = 0; - break; - } - - // we need at least 1 BYTE per character, but Convert might allow us to convert - // only part of the input, so try as much as we can. Reduce charCount if necessary - if (availableBytes < availableChars) - { - availableChars = availableBytes; - } - - // FASTLOOP: - // - optimistic range checks - // - fallbacks to the slow loop for all special cases, exception throwing, etc. - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. 
- WCHAR *pStop = pSrc + availableChars - 5; - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (BYTE)ch; - pTarget++; - - // get pSrc aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (BYTE)ch; - pTarget++; - } - - // Run 4 characters at a time! - while (pSrc < pStop) { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) { - goto LongCodeWithMask; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - *pTarget = (BYTE)(ch >> 16); - *(pTarget + 1) = (BYTE)ch; - pSrc += 4; - *(pTarget + 2) = (BYTE)(chc >> 16); - *(pTarget + 3) = (BYTE)chc; - pTarget += 4; -#else // BIGENDIAN - *pTarget = (BYTE)ch; - *(pTarget + 1) = (BYTE)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (BYTE)chc; - *(pTarget + 3) = (BYTE)(chc >> 16); - pTarget += 4; -#endif // BIGENDIAN - } - continue; - - LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); -#else // BIGENDIAN - ch = (WCHAR)ch; -#endif // BIGENDIAN - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (BYTE)ch; - pTarget++; - continue; - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - int chd; - if (ch <= 0x7FF) { - // 2 BYTE encoding - chd = 0xC0 | (ch >> 6); - } - else { - if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // 3 BYTE encoding - chd = 0xE0 | (ch >> 12); - } - else - { - // 4 BYTE encoding - high surrogate + low surrogate - if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) { - // low without high -> bad, try again in slow loop - pSrc -= 1; - break; - } - - chd = *pSrc; - pSrc++; - - // if (!IsLowSurrogate(chd)) { - if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // high not followed 
by low -> bad, try again in slow loop - pSrc -= 2; - break; - } - - ch = chd + (ch << 10) + - (0x10000 - - CharUnicodeInfo::LOW_SURROGATE_START - - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - - *pTarget = (BYTE)(0xF0 | (ch >> 18)); - // pStop - this BYTE is compensated by the second surrogate character - // 2 input chars require 4 output bytes. 2 have been anticipated already - // and 2 more will be accounted for by the 2 pStop-- calls below. - pTarget++; - - chd = 0x80 | ((ch >> 12) & 0x3F); - } - *pTarget = (BYTE)chd; - pStop--; // 3 BYTE sequence for 1 char, so need pStop-- and the one below too. - pTarget++; - - chd = 0x80 | ((ch >> 6) & 0x3F); - } - *pTarget = (BYTE)chd; - pStop--; // 2 BYTE sequence for 1 char so need pStop--. - pTarget++; - - *pTarget = (BYTE)(0x80 | (ch & 0x3F)); - // pStop - this BYTE is already included - pTarget++; - } - - Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd"); - -#endif // FASTLOOP - - // no pending char at this point - ch = 0; - } - - InternalDelete(fallbackBuffer); - - return (int)(pTarget - bytes); - } - - int GetByteCount(WCHAR *chars, int count) - { - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer* fallbackBuffer = nullptr; - WCHAR *pSrc = chars; - WCHAR *pEnd = pSrc + count; - - // Start by assuming we have as many as count - int byteCount = count; - - int ch = 0; - - while (true) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) { - - if (ch == 0) { - // Unroll any fallback that happens at the end - ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; - if (ch > 0) { - byteCount++; - goto ProcessChar; - } - } - else { - // Case of surrogates in the fallback. 
- if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate");// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - ch = fallbackBuffer->InternalGetNextChar(); - byteCount++; - - if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = 0xfffd; - byteCount++; - goto EncodeChar; - } - else if (ch > 0){ - goto ProcessChar; - } - else { - byteCount--; // ignore last one. - break; - } - } - } - - if (ch <= 0) { - break; - } - - // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1. - byteCount++; - goto EncodeChar; - } - - if (ch > 0) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate"); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // count the pending surrogate - byteCount++; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do. 
- ch = 0xfffd; - // ch = cha + (ch << 10) + - // (0x10000 - // - CharUnicodeInfo::LOW_SURROGATE_START - // - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) ); - - // Use this next char - pSrc++; - } - // else ch is still high surrogate and encoding will fail (so don't add count) - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackBuffer != nullptr) - { - ch = fallbackBuffer->InternalGetNextChar(); - if (ch > 0) - { - // We have an extra byte we weren't expecting. - byteCount++; - goto ProcessChar; - } - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { - // we will count this surrogate next time around - byteCount--; - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed - // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == nullptr) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - fallbackBuffer = encoderFallback->CreateFallbackBuffer(); - - // Set our internal fallback interesting things. - fallbackBuffer->InternalInitialize(chars, chars + count, false); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. 
- fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); - - // Ignore it if we don't throw (we had preallocated this ch) - byteCount--; - ch = 0; - continue; - } - - // Count them - if (ch > 0x7F) { - if (ch > 0x7FF) { - // the extra surrogate byte was compensated by the second surrogate character - // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char) - byteCount++; - } - byteCount++; - } - -#if WIN64 - // check for overflow - if (byteCount < 0) { - break; - } -#endif - -#ifdef FASTLOOP - // If still have fallback don't do fast loop - if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0) - { - // We're reserving 1 byte for each char by default - byteCount++; - goto ProcessChar; - } - - int availableChars = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough characters - if (availableChars <= 13) { - // try to get over the remainder of the ascii characters fast though - WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - goto ProcessChar; - } - - // we are done - break; - } - -#if WIN64 - // make sure that we won't get a silent overflow inside the fast loop - // (Fall out to slow loop if we have this many characters) - availableChars &= 0x0FFFFFFF; -#endif - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - WCHAR *pStop = pSrc + availableChars - (3 + 4); - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - - // get pSrc aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *pSrc; - pSrc++; - 
if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - } - - // Run 2 * 4 characters at a time! - while (pSrc < pStop) { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII - { - if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - - if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits) - byteCount++; - if ((ch & (int)0xFF80) != 0) - byteCount++; - if ((chc & (int)0xFF800000) != 0) - byteCount++; - if ((chc & (int)0xFF80) != 0) - byteCount++; - } - pSrc += 4; - - ch = *(int*)pSrc; - chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII - { - if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - if ((ch & (int)0xFF800000) != 0) - byteCount++; - if ((ch & (int)0xFF80) != 0) - byteCount++; - if ((chc & (int)0xFF800000) != 0) - byteCount++; - if ((chc & (int)0xFF80) != 0) - byteCount++; - } - pSrc += 4; - } - break; - - LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); -#else // BIGENDIAN - ch = (WCHAR)ch; -#endif // BIGENDIAN - pSrc++; - - if (ch <= 0x7F) { - continue; - } - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - if (ch > 0x7FF) { - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // 4 byte encoding - high surrogate + low surrogate - - int chd = *pSrc; - if ( - ch > CharUnicodeInfo::HIGH_SURROGATE_END || - !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Back up and drop out to slow loop to figure out error - pSrc--; - break; - } - pSrc++; - - // byteCount - this 
byte is compensated by the second surrogate character - } - byteCount++; - } - byteCount++; - - // byteCount - the last byte is already included - } -#endif // FASTLOOP - - // no pending char at this point - ch = 0; - } - -#if WIN64 - // check for overflow - if (byteCount < 0) { - throw ArgumentException("Conversion buffer overflow."); - } -#endif - - Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0, - "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer"); - - InternalDelete(fallbackBuffer); - - return byteCount; - } - -}; - - -//////////////////////////////////////////////////////////////////////////// -// -// UTF8ToUnicode -// -// Maps a UTF-8 character string to its wide character string counterpart. -// -//////////////////////////////////////////////////////////////////////////// - -int UTF8ToUnicode( - LPCSTR lpSrcStr, - int cchSrc, - LPWSTR lpDestStr, - int cchDest, - DWORD dwFlags - ) -{ - int ret; - UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS); - try { - ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc); - if (cchDest){ - if (ret > cchDest){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - ret = 0; - } - enc.GetChars((BYTE*)lpSrcStr, cchSrc, (WCHAR*)lpDestStr, ret); - } - } - catch (const InsufficientBufferException& e){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - return 0; - } - catch (const DecoderFallbackException& e){ - SetLastError(ERROR_NO_UNICODE_TRANSLATION); - return 0; - } - catch (const ArgumentException& e){ - SetLastError(ERROR_INVALID_PARAMETER); - return 0; - } - return ret; -} - -//////////////////////////////////////////////////////////////////////////// -// -// UnicodeToUTF8 -// -// Maps a Unicode character string to its UTF-8 string counterpart. 
-// -//////////////////////////////////////////////////////////////////////////// - -int UnicodeToUTF8( - LPCWSTR lpSrcStr, - int cchSrc, - LPSTR lpDestStr, - int cchDest) -{ - int ret; - UTF8Encoding enc(false); - try{ - ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc); - if (cchDest){ - if (ret > cchDest){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - ret = 0; - } - enc.GetBytes((WCHAR*)lpSrcStr, cchSrc, (BYTE*)lpDestStr, ret); - } - } - catch (const InsufficientBufferException& e){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - return 0; - } - catch (const EncoderFallbackException& e){ - SetLastError(ERROR_NO_UNICODE_TRANSLATION); - return 0; - } - catch (const ArgumentException& e){ - SetLastError(ERROR_INVALID_PARAMETER); - return 0; - } - return ret; -} diff --git a/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp b/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp index cab71f15e7098e..2b9f67b17bfbde 100644 --- a/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp +++ b/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp @@ -217,7 +217,7 @@ PALTEST(locale_info_MultiByteToWideChar_test4_paltest_multibytetowidechar_test4, if (wcscmp(wideBuffer, unicodeStrings[i]) != 0) { - Fail("MultiByteToWideChar string %d: the resulting string doesn't match the expected one!\n", i); + printf("MultiByteToWideChar string %d: the resulting string doesn't match the expected one!\n", i); } free(wideBuffer); diff --git a/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp b/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp index bf2dabedefa880..387015f0af71cc 100644 --- a/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp +++ b/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp @@ -141,7 +141,7 @@ 
PALTEST(locale_info_WideCharToMultiByte_test5_paltest_widechartomultibyte_test5, if (strcmp(utf8Buffer, utf8Strings[i]) != 0) { - Fail("WideCharToMultiByte string %d: the resulting string doesn't match the expected one!\n", i); + printf("WideCharToMultiByte string %d: the resulting string doesn't match the expected one!\n", i); } free(utf8Buffer); diff --git a/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineEvent.cs b/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineEvent.cs index 05cda03a777379..a01945d7f64911 100644 --- a/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineEvent.cs +++ b/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineEvent.cs @@ -26,7 +26,7 @@ public static IEnumerable TestData() } [Theory] - [ActiveIssue("https://github.com/dotnet/runtime/issues/2389", TestRuntimes.Mono)] + [ActiveIssue("https://github.com/dotnet/runtime/issues/2389", TestRuntimes.Mono | TestRuntimes.CoreCLR)] [MemberData(nameof(TestData))] public void DefineEvent(string name, EventAttributes attributes, Type eventType, string expectedName, EventAttributes expectedAttributes) { diff --git a/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineProperty.cs b/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineProperty.cs index 84d27ee2f98e31..500ee8104766d7 100644 --- a/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineProperty.cs +++ b/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineProperty.cs @@ -29,7 +29,7 @@ public static IEnumerable TestData() } [Theory] - [ActiveIssue("https://github.com/dotnet/runtime/issues/2389", TestRuntimes.Mono)] + [ActiveIssue("https://github.com/dotnet/runtime/issues/2389", TestRuntimes.Mono | TestRuntimes.CoreCLR)] [MemberData(nameof(TestData))] public void DefineProperty(string name, PropertyAttributes attributes, Type returnType, Type[] parameterTypes, string expectedName, 
PropertyAttributes expectedPropertyAttributes) { diff --git a/src/mono/mono/eglib/CMakeLists.txt b/src/mono/mono/eglib/CMakeLists.txt index 3de4a9c83d2f5b..b2945231711711 100644 --- a/src/mono/mono/eglib/CMakeLists.txt +++ b/src/mono/mono/eglib/CMakeLists.txt @@ -20,7 +20,7 @@ set(eglib_common_sources gbytearray.c gerror.c ghashtable.c - giconv.c + ${CLR_SRC_NATIVE_DIR}/minipal/utf8converter.c gmem.c goutput.c gstr.c @@ -32,8 +32,7 @@ set(eglib_common_sources gpath.c gspawn.c gfile.c - gfile-posix.c - gutf8.c) + gfile-posix.c) set(eglib_headers glib.h diff --git a/src/mono/mono/eglib/glib.h b/src/mono/mono/eglib/glib.h index e438c00298ec72..ef2101315eee98 100644 --- a/src/mono/mono/eglib/glib.h +++ b/src/mono/mono/eglib/glib.h @@ -29,6 +29,7 @@ #include #include #include +#include "../utils/mono-errno.h" #ifndef EGLIB_NO_REMAP #include diff --git a/src/mono/mono/eglib/gutf8.c b/src/mono/mono/eglib/gutf8.c deleted file mode 100644 index 965a69f42e655d..00000000000000 --- a/src/mono/mono/eglib/gutf8.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * gutf8.c: UTF-8 conversion - * - * Author: - * Atsushi Enomoto - * - * (C) 2006 Novell, Inc. - * Copyright 2012 Xamarin Inc - */ -#include "config.h" -#include -#include - -/* - * Index into the table below with the first byte of a UTF-8 sequence to get - * the number of bytes that are supposed to follow it to complete the sequence. - * - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left - * as-is for anyone who may want to do such conversion, which was allowed in - * earlier algorithms. 
-*/ -const guchar g_utf8_jump_table[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 -}; - -static gboolean -utf8_validate (const unsigned char *inptr, size_t len) -{ - const unsigned char *ptr = inptr + len; - unsigned char c; - - /* Everything falls through when TRUE... */ - switch (len) { - default: - return FALSE; - case 4: - if ((c = (*--ptr)) < 0x80 || c > 0xBF) - return FALSE; - - if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) { - if (ptr[-2] == 0x8F || ptr[-2] == 0x9F || - ptr[-2] == 0xAF || ptr[-2] == 0xBF) - return FALSE; - } - case 3: - if ((c = (*--ptr)) < 0x80 || c > 0xBF) - return FALSE; - case 2: - if ((c = (*--ptr)) < 0x80 || c > 0xBF) - return FALSE; - - /* no fall-through in this inner switch */ - switch (*inptr) { - case 0xE0: if (c < 0xA0) return FALSE; break; - case 0xED: if (c > 0x9F) return FALSE; break; - case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE; - if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE; - break; - case 0xF0: if (c < 0x90) return FALSE; break; - case 0xF4: if (c > 0x8F) return FALSE; break; - default: if (c < 0x80) return FALSE; break; - } - case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE; - } - - if (*inptr > 0xF4) - return FALSE; - - return TRUE; -} - -/** - * g_utf8_validate: - * @str: a utf-8 encoded string - * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string) - * @end: output parameter to mark the end of the valid input - * - 
* Checks @utf for being valid UTF-8. @str is assumed to be - * null-terminated. This function is not super-strict, as it will - * allow longer UTF-8 sequences than necessary. Note that Java is - * capable of producing these sequences if provoked. Also note, this - * routine checks for the 4-byte maximum size, but does not check for - * 0x10ffff maximum value. - * - * Return value: %TRUE if @str is valid or %FALSE otherwise. - **/ -gboolean -g_utf8_validate (const gchar *str, gssize max_len, const gchar **end) -{ - guchar *inptr = (guchar *) str; - gboolean valid = TRUE; - guint length, min; - gssize n = 0; - - if (max_len == 0) - return FALSE; - - if (max_len < 0) { - while (*inptr != 0) { - length = g_utf8_jump_table[*inptr]; - if (!utf8_validate (inptr, length)) { - valid = FALSE; - break; - } - - inptr += length; - } - } else { - while (n < max_len) { - if (*inptr == 0) { - /* Note: return FALSE if we encounter nul-byte - * before max_len is reached. */ - valid = FALSE; - break; - } - - length = g_utf8_jump_table[*inptr]; - min = MIN (length, GSSIZE_TO_UINT (max_len - n)); - - if (!utf8_validate (inptr, min)) { - valid = FALSE; - break; - } - - if (min < length) { - valid = FALSE; - break; - } - - inptr += length; - n += length; - } - } - - if (end != NULL) - *end = (gchar *) inptr; - - return valid; -} - -gunichar -g_utf8_get_char_validated (const gchar *str, gssize max_len) -{ - unsigned char *inptr = (unsigned char *) str; - gunichar u = *inptr; - int n, i; - - if (max_len == 0) - return -2; - - if (u < 0x80) { - /* simple ascii case */ - return u; - } else if (u < 0xc2) { - return -1; - } else if (u < 0xe0) { - u &= 0x1f; - n = 2; - } else if (u < 0xf0) { - u &= 0x0f; - n = 3; - } else if (u < 0xf8) { - u &= 0x07; - n = 4; - } else if (u < 0xfc) { - u &= 0x03; - n = 5; - } else if (u < 0xfe) { - u &= 0x01; - n = 6; - } else { - return -1; - } - - if (max_len > 0) { - if (!utf8_validate (inptr, MIN (max_len, n))) - return -1; - - if (max_len < n) - return -2; 
- } else { - if (!utf8_validate (inptr, n)) - return -1; - } - - for (i = 1; i < n; i++) - u = (u << 6) | (*++inptr ^ 0x80); - - return u; -} - -glong -g_utf8_strlen (const gchar *str, gssize max_len) -{ - const guchar *inptr = (const guchar *) str; - glong clen = 0, len = 0, n; - - if (max_len == 0) - return 0; - - if (max_len < 0) { - while (*inptr) { - inptr += g_utf8_jump_table[*inptr]; - len++; - } - } else { - while (len < max_len && *inptr) { - n = g_utf8_jump_table[*inptr]; - if ((clen + n) > max_len) - break; - - inptr += n; - clen += n; - len++; - } - } - - return len; -} - -gunichar -g_utf8_get_char (const gchar *src) -{ - unsigned char *inptr = (unsigned char *) src; - gunichar u = *inptr; - int n, i; - - if (u < 0x80) { - /* simple ascii case */ - return u; - } else if (u < 0xe0) { - u &= 0x1f; - n = 2; - } else if (u < 0xf0) { - u &= 0x0f; - n = 3; - } else if (u < 0xf8) { - u &= 0x07; - n = 4; - } else if (u < 0xfc) { - u &= 0x03; - n = 5; - } else { - u &= 0x01; - n = 6; - } - - for (i = 1; i < n; i++) - u = (u << 6) | (*++inptr ^ 0x80); - - return u; -} - -gchar * -g_utf8_offset_to_pointer (const gchar *str, glong offset) -{ - const gchar *p = str; - - if (offset > 0) { - do { - p = g_utf8_next_char (p); - offset --; - } while (offset > 0); - } - else if (offset < 0) { - const gchar *jump = str; - do { - // since the minimum size of a character is 1 - // we know we can step back at least offset bytes - jump = jump + offset; - - // if we land in the middle of a character - // walk to the beginning - while ((*jump & 0xc0) == 0x80) - jump --; - - // count how many characters we've actually walked - // by going forward - p = jump; - do { - p = g_utf8_next_char (p); - offset ++; - } while (p < jump); - - } while (offset < 0); - } - - return (gchar *)p; -} - -glong -g_utf8_pointer_to_offset (const gchar *str, const gchar *pos) -{ - const gchar *inptr, *inend; - glong offset = 0; - glong sign = 1; - - if (pos == str) - return 0; - - if (str < pos) { - 
inptr = str; - inend = pos; - } else { - inptr = pos; - inend = str; - sign = -1; - } - - do { - inptr = g_utf8_next_char (inptr); - offset++; - } while (inptr < inend); - - return offset * sign; -} diff --git a/src/mono/mono/eglib/giconv.c b/src/native/minipal/utf8converter.c similarity index 68% rename from src/mono/mono/eglib/giconv.c rename to src/native/minipal/utf8converter.c index 664ad31bba258a..0aeada3f4773a0 100644 --- a/src/mono/mono/eglib/giconv.c +++ b/src/native/minipal/utf8converter.c @@ -1,32 +1,7 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2011 Jeffrey Stedfast - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ -#include -#include -#include -#include -#include "../utils/mono-errno.h" +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +#include #ifdef _MSC_VER #define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE @@ -34,40 +9,333 @@ #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline)) #endif - -#define UNROLL_DECODE_UTF8 0 -#define UNROLL_ENCODE_UTF8 0 - -static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf32be (gunichar c, char *outbuf, size_t outleft); - -static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf32le (gunichar c, char *outbuf, size_t outleft); - -static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf16be (gunichar c, char *outbuf, size_t outleft); - -static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf16le (gunichar c, char *outbuf, size_t outleft); - -static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf8 (gunichar c, char *outbuf, size_t outleft); - -static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_latin1 (gunichar c, char *outbuf, size_t outleft); - #if G_BYTE_ORDER == G_LITTLE_ENDIAN #define decode_utf32 decode_utf32le #define encode_utf32 encode_utf32le #define decode_utf16 decode_utf16le #define encode_utf16 encode_utf16le +#define GUINT16_TO_LE(x) (x) +#define GUINT16_TO_BE(x) GUINT16_SWAP_LE_BE(x) #else #define decode_utf32 decode_utf32be #define encode_utf32 encode_utf32be #define decode_utf16 decode_utf16be #define encode_utf16 encode_utf16be +#define GUINT16_TO_LE(x) GUINT16_SWAP_LE_BE(x) +#define GUINT16_TO_BE(x) (x) #endif +/* + * Index into the table below with the first byte of a UTF-8 sequence to get + * the number of bytes that are supposed to follow it to complete the sequence. + * + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left + * as-is for anyone who may want to do such conversion, which was allowed in + * earlier algorithms. 
+*/ +const guchar g_utf8_jump_table[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 +}; + +static gboolean +utf8_validate (const unsigned char *inptr, size_t len) +{ + const unsigned char *ptr = inptr + len; + unsigned char c; + + /* Everything falls through when TRUE... */ + switch (len) { + default: + return FALSE; + case 4: + if ((c = (*--ptr)) < 0x80 || c > 0xBF) + return FALSE; + + if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) { + if (ptr[-2] == 0x8F || ptr[-2] == 0x9F || + ptr[-2] == 0xAF || ptr[-2] == 0xBF) + return FALSE; + } + case 3: + if ((c = (*--ptr)) < 0x80 || c > 0xBF) + return FALSE; + case 2: + if ((c = (*--ptr)) < 0x80 || c > 0xBF) + return FALSE; + + /* no fall-through in this inner switch */ + switch (*inptr) { + case 0xE0: if (c < 0xA0) return FALSE; break; + case 0xED: if (c > 0x9F) return FALSE; break; + case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE; + if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE; + break; + case 0xF0: if (c < 0x90) return FALSE; break; + case 0xF4: if (c > 0x8F) return FALSE; break; + default: if (c < 0x80) return FALSE; break; + } + case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE; + } + + if (*inptr > 0xF4) + return FALSE; + + return TRUE; +} + +/** + * g_utf8_validate: + * @str: a utf-8 encoded string + * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string) + * @end: output parameter to mark the end of the valid input + * + 
* Checks @utf for being valid UTF-8. @str is assumed to be + * null-terminated. This function is not super-strict, as it will + * allow longer UTF-8 sequences than necessary. Note that Java is + * capable of producing these sequences if provoked. Also note, this + * routine checks for the 4-byte maximum size, but does not check for + * 0x10ffff maximum value. + * + * Return value: %TRUE if @str is valid or %FALSE otherwise. + **/ +gboolean +g_utf8_validate (const gchar *str, gssize max_len, const gchar **end) +{ + guchar *inptr = (guchar *) str; + gboolean valid = TRUE; + guint length, min; + gssize n = 0; + + if (max_len == 0) + return FALSE; + + if (max_len < 0) { + while (*inptr != 0) { + length = g_utf8_jump_table[*inptr]; + if (!utf8_validate (inptr, length)) { + valid = FALSE; + break; + } + + inptr += length; + } + } else { + while (n < max_len) { + if (*inptr == 0) { + /* Note: return FALSE if we encounter nul-byte + * before max_len is reached. */ + valid = FALSE; + break; + } + + length = g_utf8_jump_table[*inptr]; + min = MIN (length, GSSIZE_TO_UINT (max_len - n)); + + if (!utf8_validate (inptr, min)) { + valid = FALSE; + break; + } + + if (min < length) { + valid = FALSE; + break; + } + + inptr += length; + n += length; + } + } + + if (end != NULL) + *end = (gchar *) inptr; + + return valid; +} + +gunichar +g_utf8_get_char_validated (const gchar *str, gssize max_len) +{ + unsigned char *inptr = (unsigned char *) str; + gunichar u = *inptr; + int n, i; + + if (max_len == 0) + return -2; + + if (u < 0x80) { + /* simple ascii case */ + return u; + } else if (u < 0xc2) { + return -1; + } else if (u < 0xe0) { + u &= 0x1f; + n = 2; + } else if (u < 0xf0) { + u &= 0x0f; + n = 3; + } else if (u < 0xf8) { + u &= 0x07; + n = 4; + } else if (u < 0xfc) { + u &= 0x03; + n = 5; + } else if (u < 0xfe) { + u &= 0x01; + n = 6; + } else { + return -1; + } + + if (max_len > 0) { + if (!utf8_validate (inptr, MIN (max_len, n))) + return -1; + + if (max_len < n) + return -2; 
+ } else { + if (!utf8_validate (inptr, n)) + return -1; + } + + for (i = 1; i < n; i++) + u = (u << 6) | (*++inptr ^ 0x80); + + return u; +} + +glong +g_utf8_strlen (const gchar *str, gssize max_len) +{ + const guchar *inptr = (const guchar *) str; + glong clen = 0, len = 0, n; + + if (max_len == 0) + return 0; + + if (max_len < 0) { + while (*inptr) { + inptr += g_utf8_jump_table[*inptr]; + len++; + } + } else { + while (len < max_len && *inptr) { + n = g_utf8_jump_table[*inptr]; + if ((clen + n) > max_len) + break; + + inptr += n; + clen += n; + len++; + } + } + + return len; +} + +gunichar +g_utf8_get_char (const gchar *src) +{ + unsigned char *inptr = (unsigned char *) src; + gunichar u = *inptr; + int n, i; + + if (u < 0x80) { + /* simple ascii case */ + return u; + } else if (u < 0xe0) { + u &= 0x1f; + n = 2; + } else if (u < 0xf0) { + u &= 0x0f; + n = 3; + } else if (u < 0xf8) { + u &= 0x07; + n = 4; + } else if (u < 0xfc) { + u &= 0x03; + n = 5; + } else { + u &= 0x01; + n = 6; + } + + for (i = 1; i < n; i++) + u = (u << 6) | (*++inptr ^ 0x80); + + return u; +} + +gchar * +g_utf8_offset_to_pointer (const gchar *str, glong offset) +{ + const gchar *p = str; + + if (offset > 0) { + do { + p = g_utf8_next_char (p); + offset --; + } while (offset > 0); + } + else if (offset < 0) { + const gchar *jump = str; + do { + // since the minimum size of a character is 1 + // we know we can step back at least offset bytes + jump = jump + offset; + + // if we land in the middle of a character + // walk to the beginning + while ((*jump & 0xc0) == 0x80) + jump --; + + // count how many characters we've actually walked + // by going forward + p = jump; + do { + p = g_utf8_next_char (p); + offset ++; + } while (p < jump); + + } while (offset < 0); + } + + return (gchar *)p; +} + +glong +g_utf8_pointer_to_offset (const gchar *str, const gchar *pos) +{ + const gchar *inptr, *inend; + glong offset = 0; + glong sign = 1; + + if (pos == str) + return 0; + + if (str < pos) { + 
inptr = str; + inend = pos; + } else { + inptr = pos; + inend = str; + sign = -1; + } + + do { + inptr = g_utf8_next_char (inptr); + offset++; + } while (inptr < inend); + + return offset * sign; +} + /* * Unicode encoders and decoders */ @@ -419,12 +687,12 @@ encode_latin1 (gunichar c, char *outbuf, size_t outleft) * Simple conversion API */ -static gpointer error_quark = (gpointer)"ConvertError"; +static gpointer g_error_quark = (gpointer)"ConvertError"; gpointer g_convert_error_quark (void) { - return error_quark; + return g_error_quark; } /* * Unicode conversion @@ -546,7 +814,7 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) if (items_written) *items_written = n; - outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar)); + outptr = outbuf = (gunichar *)g_malloc ((n + 1) * sizeof (gunichar)); inptr = (char *) str; for (i = 0; i < n; i++) { @@ -560,7 +828,7 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) } static gunichar2 * -eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) +eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) { gunichar2 *outbuf, *outptr; size_t outlen = 0; @@ -611,7 +879,7 @@ eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written = (glong)outlen; if (G_LIKELY (!custom_alloc_func)) - outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2)); + outptr = outbuf = (gunichar2 *)g_malloc ((outlen + 1) * sizeof (gunichar2)); else outptr = outbuf = (gunichar2 *)custom_alloc_func ((outlen + 1) * sizeof (gunichar2), custom_alloc_data); @@ 
-642,7 +910,8 @@ eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong inptr += n; } - *outptr = '\0'; + if (null_terminate) + *outptr = '\0'; return outbuf; @@ -672,49 +941,55 @@ eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong gunichar2 * g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BYTE_ORDER); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_BYTE_ORDER); } gunichar2 * g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BIG_ENDIAN); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_BIG_ENDIAN); } gunichar2 * g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_LITTLE_ENDIAN); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_LITTLE_ENDIAN); } gunichar2 * g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); +} + +gunichar2 * +g_utf8_to_utf16_custom_alloc_optional (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean 
include_nuls, gboolean replace_invalid_codepoints, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) +{ + return eg_utf8_to_utf16_general (str, len, items_read, items_written, include_nuls, replace_invalid_codepoints, null_terminate, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); } gunichar2 * g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BIG_ENDIAN); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, custom_alloc_func, custom_alloc_data, err, G_BIG_ENDIAN); } gunichar2 * g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_LITTLE_ENDIAN); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, custom_alloc_func, custom_alloc_data, err, G_LITTLE_ENDIAN); } gunichar2 * eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, NULL, NULL, err, G_BYTE_ORDER); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, TRUE, NULL, NULL, err, G_BYTE_ORDER); } gunichar2 * eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, NULL, NULL, err, G_BYTE_ORDER); + return eg_utf8_to_utf16_general (str, len, items_read, 
items_written, TRUE, TRUE, TRUE, NULL, NULL, err, G_BYTE_ORDER); } gunichar * @@ -769,7 +1044,7 @@ g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_wri if (items_read) *items_read = GPTRDIFF_TO_LONG (inptr - str); - outptr = outbuf = g_malloc (outlen + 4); + outptr = outbuf = (gunichar *)g_malloc (outlen + 4); inptr = (char *) str; inleft = len; @@ -791,17 +1066,23 @@ g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_wri static gchar * -eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) +eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) { char *inptr, *outbuf, *outptr; size_t outlen = 0; size_t inleft; gunichar c; + gboolean replaced = FALSE; int n; g_return_val_if_fail (str != NULL, NULL); if (len < 0) { + if (include_nuls) { + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length"); + return NULL; + } + len = 0; while (str[len]) len++; @@ -818,30 +1099,37 @@ eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, gl inptr += 2; } - if (errno == EILSEQ) { + if (errno == EILSEQ && !replace_invalid_codepoints) { g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Illegal byte sequence encountered in the input."); - } else if (items_read) { + } else if (items_read && !replace_invalid_codepoints) { /* partial input is ok if we can let our caller know... 
*/ break; - } else { + } else if (!replace_invalid_codepoints) { g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, "Partial byte sequence encountered in the input."); } - if (items_read) - *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2); + if (replace_invalid_codepoints) { + n = sizeof(gunichar); + c = '?'; + replaced = TRUE; + } else { + if (items_read) + *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2); - if (items_written) - *items_written = 0; + if (items_written) + *items_written = 0; - return NULL; - } else if (c == 0) + return NULL; + } + } else if (c == 0 && !include_nuls) break; - outlen += g_unichar_to_utf8 (c, NULL); + outlen += (replaced && replace_invalid_codepoints) ? n - 1 : g_unichar_to_utf8 (c, NULL); inleft -= n; inptr += n; + replaced = FALSE; } if (items_read) @@ -851,7 +1139,7 @@ eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, gl *items_written = (glong)outlen; if (G_LIKELY (!custom_alloc_func)) - outptr = outbuf = g_malloc (outlen + 1); + outptr = outbuf = (char *)g_malloc (outlen + 1); else outptr = outbuf = (char *)custom_alloc_func (outlen + 1, custom_alloc_data); @@ -866,17 +1154,24 @@ eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, gl inleft = len * 2; while (inleft > 0) { - if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) - break; - else if (c == 0) + if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) { + if (replace_invalid_codepoints) { + outptr += '?'; + n = sizeof(gunichar); + } else + break; + } else if (c == 0 && !include_nuls) { break; + } else { + outptr += g_unichar_to_utf8 (c, outptr); + } - outptr += g_unichar_to_utf8 (c, outptr); inleft -= n; inptr += n; } - *outptr = '\0'; + if (null_terminate) + *outptr = '\0'; return outbuf; } @@ -884,25 +1179,31 @@ eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, gl gchar * g_utf16_to_utf8 (const gunichar2 *str, glong len, glong 
*items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BYTE_ORDER); + return eg_utf16_to_utf8_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_BYTE_ORDER); } gchar * g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_LITTLE_ENDIAN); + return eg_utf16_to_utf8_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_LITTLE_ENDIAN); } gchar * g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BIG_ENDIAN); + return eg_utf16_to_utf8_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_BIG_ENDIAN); } gchar * g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); + return eg_utf16_to_utf8_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); +} + +gchar * +g_utf16_to_utf8_custom_alloc_with_nulls (const gunichar2 *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) +{ + return eg_utf16_to_utf8_general (str, len, items_read, items_written, include_nuls, TRUE, null_terminate, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); } gunichar * @@ -966,7 +1267,7 @@ g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *item if 
(items_written) *items_written = (glong)(outlen / 4); - outptr = outbuf = g_malloc (outlen + 4); + outptr = outbuf = (gunichar *)g_malloc (outlen + 4); inptr = (char *) str; inleft = len * 2; @@ -1034,7 +1335,7 @@ g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_ len = i; - outptr = outbuf = g_malloc (outlen + 1); + outptr = outbuf = (char *)g_malloc (outlen + 1); for (i = 0; i < len; i++) outptr += g_unichar_to_utf8 (str[i], outptr); *outptr = 0; @@ -1096,7 +1397,7 @@ g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items len = i; - outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2)); + outptr = outbuf = (gunichar2 *)g_malloc ((outlen + 1) * sizeof (gunichar2)); for (i = 0; i < len; i++) outptr += g_unichar_to_utf16 (str[i], outptr); *outptr = 0; diff --git a/src/native/minipal/utf8converter.h b/src/native/minipal/utf8converter.h new file mode 100644 index 00000000000000..06cd677dfe1955 --- /dev/null +++ b/src/native/minipal/utf8converter.h @@ -0,0 +1,200 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +#ifndef HAVE_MINIPAL_UTF8CONVERTER_H +#define HAVE_MINIPAL_UTF8CONVERTER_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CORECLR +#include "glib.h" +#endif + +#ifdef _MSC_VER +#define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE +#else +#define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline)) +#endif + +#if G_BYTE_ORDER == G_LITTLE_ENDIAN +#define decode_utf32 decode_utf32le +#define encode_utf32 encode_utf32le +#define decode_utf16 decode_utf16le +#define encode_utf16 encode_utf16le +#define GUINT16_TO_LE(x) (x) +#define GUINT16_TO_BE(x) GUINT16_SWAP_LE_BE(x) +#else +#define decode_utf32 decode_utf32be +#define encode_utf32 encode_utf32be +#define decode_utf16 decode_utf16be +#define encode_utf16 encode_utf16be +#define GUINT16_TO_LE(x) GUINT16_SWAP_LE_BE(x) +#define GUINT16_TO_BE(x) (x) +#endif + +#ifdef CORECLR + +#ifdef TARGET_64BIT +#define ptrdiff_t int64_t +#else +#define ptrdiff_t int32_t +#endif + +#define gunichar uint32_t +#define gunichar2 uint16_t +#define guint uint32_t +#define gchar char +#define guchar unsigned char +#define gboolean bool +#define gsize size_t +#define gssize ptrdiff_t +#define gint int32_t +#define glong long +#define gptrdiff ptrdiff_t +#define guint8 uint8_t +#define guint16 uint16_t +#define gpointer void* +#define g_malloc malloc +#define TRUE 1 +#define FALSE 0 +#ifndef MIN +#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) +#endif + +typedef void* (*GCustomAllocator) (size_t req_size, void* custom_alloc_data); + +typedef struct { + /* In the real glib, this is a GQuark, but we dont use/need that */ + void* domain; + int32_t code; + char *message; +} GError; + +typedef struct { + void* buffer; + size_t buffer_size; + size_t req_buffer_size; +} GFixedBufferCustomAllocatorData; + +typedef enum { + G_CONVERT_ERROR_NO_CONVERSION, + G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + G_CONVERT_ERROR_FAILED, + G_CONVERT_ERROR_PARTIAL_INPUT, + G_CONVERT_ERROR_BAD_URI, + G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, + G_CONVERT_ERROR_NO_MEMORY +} GConvertError; + +#define UNROLL_DECODE_UTF8 0 +#define UNROLL_ENCODE_UTF8 0 + +static int decode_utf32be (char *inbuf, size_t inleft, uint32_t *outchar); +static int encode_utf32be (uint32_t c, char *outbuf, size_t outleft); + +static int decode_utf32le (char *inbuf, size_t inleft, uint32_t *outchar); +static int encode_utf32le (uint32_t c, char *outbuf, size_t outleft); + +static int decode_utf16be (char *inbuf, size_t inleft, uint32_t *outchar); +static int encode_utf16be (uint32_t c, char *outbuf, size_t outleft); + +static int decode_utf16le (char *inbuf, size_t inleft, uint32_t *outchar); +static int encode_utf16le (uint32_t c, char *outbuf, size_t outleft); + +static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, uint32_t *outchar); +static int encode_utf8 (uint32_t c, char *outbuf, size_t outleft); + +static int decode_latin1 (char *inbuf, size_t inleft, uint32_t *outchar); +static int encode_latin1 (uint32_t c, char *outbuf, size_t outleft); + +#define G_LITTLE_ENDIAN 1234 +#define G_BIG_ENDIAN 4321 +#define GUINT16_SWAP_LE_BE(x) ((uint16_t) (((uint16_t) x) >> 8) | ((((uint16_t)(x)) & 0xff) << 8)) + +#ifdef BIGENDIAN +#define G_BYTE_ORDER G_BIG_ENDIAN +#else +#define G_BYTE_ORDER G_LITTLE_ENDIAN +#endif + +#define G_CAST_TYPE_TO_TYPE(src,dest,v) ((dest)(v)) +#define G_CAST_PTRTYPE_TO_STYPE(src,dest,v) ((dest)(gssize)(v)) +#define 
GUINT32_TO_UINT16(v) G_CAST_TYPE_TO_TYPE(guint32, guint16, v) +#define GSIZE_TO_INT(v) G_CAST_TYPE_TO_TYPE(gsize, gint, v) +#define GSSIZE_TO_UINT(v) G_CAST_TYPE_TO_TYPE(gssize, guint, v) +#define GUNICHAR_TO_UINT8(v) G_CAST_TYPE_TO_TYPE(gunichar, guint8, v) +#define GUNICHAR_TO_UINT16(v) G_CAST_TYPE_TO_TYPE(gunichar, guint16, v) +#define GUNICHAR_TO_CHAR(v) G_CAST_TYPE_TO_TYPE(gunichar, gchar, v) +#define GPTRDIFF_TO_LONG(v) G_CAST_PTRTYPE_TO_STYPE(gptrdiff, glong, v) +#define g_return_val_if_fail(x,e) do { if (!(x)) { printf ("%s:%d: assertion '%s' failed\n", __FILE__, __LINE__, #x); return (e); } } while(0) +#define g_utf8_next_char(p) ((p) + g_utf8_jump_table[(unsigned char)(*p)]) + +#if defined(__GNUC__) && (__GNUC__ > 2) +#define G_LIKELY(expr) (__builtin_expect ((expr) != 0, 1)) +#define G_UNLIKELY(expr) (__builtin_expect ((expr) != 0, 0)) +#else +#define G_LIKELY(x) (x) +#define G_UNLIKELY(x) (x) +#endif + +void +g_set_error (GError **err, void* domain, int32_t code, const char *format, ...) 
+{ + va_list args; + + if (err) { + *err = (GError *) malloc (sizeof (GError)); + (*err)->domain = domain; + (*err)->code = code; + + va_start (args, format); + int s = vsnprintf(NULL, 0, format, args); + va_end(args); + + if (s > -1) + { + (*err)->message = (char*)malloc(s); + + va_start(args, format); + vsnprintf((*err)->message, s, format, args); + va_end (args); + } + } +} + +#define G_CONVERT_ERROR g_convert_error_quark() + +inline static void +mono_set_errno (int errno_val) +{ + errno = errno_val; +} + +#endif // CORECLR + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Unicode encoders and decoders + */ + +gunichar2 * +g_utf8_to_utf16_custom_alloc_optional (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); + +gchar * +g_utf16_to_utf8_custom_alloc_with_nulls (const gunichar2 *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); + +#ifdef __cplusplus +} +#endif // extern "C" + +#endif //HAVE_MINIPAL_UTF8CONVERTER_H From 3eca7a47f0913226206714f8fc75c820a79b951d Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Thu, 18 May 2023 20:37:27 +0300 Subject: [PATCH 2/9] Revert "Share UTF8 converters between coreclr and mono - v1" This reverts commit f9845ac6f53dc95fb747eb21351dfa9412397217. 
--- src/coreclr/pal/src/CMakeLists.txt | 4 +- src/coreclr/pal/src/locale/unicode.cpp | 93 +- src/coreclr/pal/src/locale/utf8.cpp | 2937 +++++++++++++++++ .../MultiByteToWideChar/test4/test4.cpp | 2 +- .../WideCharToMultiByte/test5/test5.cpp | 2 +- .../TypeBuilder/TypeBuilderDefineEvent.cs | 2 +- .../TypeBuilder/TypeBuilderDefineProperty.cs | 2 +- src/mono/mono/eglib/CMakeLists.txt | 5 +- .../mono/eglib/giconv.c} | 481 +-- src/mono/mono/eglib/glib.h | 1 - src/mono/mono/eglib/gutf8.c | 323 ++ src/native/minipal/utf8converter.h | 200 -- 12 files changed, 3369 insertions(+), 683 deletions(-) create mode 100644 src/coreclr/pal/src/locale/utf8.cpp rename src/{native/minipal/utf8converter.c => mono/mono/eglib/giconv.c} (68%) create mode 100644 src/mono/mono/eglib/gutf8.c delete mode 100644 src/native/minipal/utf8converter.h diff --git a/src/coreclr/pal/src/CMakeLists.txt b/src/coreclr/pal/src/CMakeLists.txt index 22d9c29594dd27..bd5a6bdf4d5b22 100644 --- a/src/coreclr/pal/src/CMakeLists.txt +++ b/src/coreclr/pal/src/CMakeLists.txt @@ -152,7 +152,7 @@ set(SOURCES loader/module.cpp locale/unicode.cpp locale/unicodedata.cpp - ${CLR_SRC_NATIVE_DIR}/minipal/utf8converter.c + locale/utf8.cpp map/common.cpp map/map.cpp map/virtual.cpp @@ -213,8 +213,6 @@ set(SOURCES thread/threadsusp.cpp ) -set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8converter.c" PROPERTIES COMPILE_FLAGS -Wno-implicit-fallthrough) - if(NOT CLR_CMAKE_USE_SYSTEM_LIBUNWIND) set(LIBUNWIND_OBJECTS $) endif(NOT CLR_CMAKE_USE_SYSTEM_LIBUNWIND) diff --git a/src/coreclr/pal/src/locale/unicode.cpp b/src/coreclr/pal/src/locale/unicode.cpp index b4c832c3d2e599..f29eabc07d9be3 100644 --- a/src/coreclr/pal/src/locale/unicode.cpp +++ b/src/coreclr/pal/src/locale/unicode.cpp @@ -34,7 +34,6 @@ Revision History: #include #include -#include using namespace CorUnix; @@ -228,7 +227,7 @@ MultiByteToWideChar( OUT LPWSTR lpWideCharStr, IN int cchWideChar) { - long retval = 0; + INT retval =0; 
PERF_ENTRY(MultiByteToWideChar); ENTRY("MultiByteToWideChar(CodePage=%u, dwFlags=%#x, lpMultiByteStr=%p (%s)," @@ -254,51 +253,16 @@ MultiByteToWideChar( goto EXIT; } - // Use g_utf8_to_utf16_custom_alloc_optional on all systems, since it replaces + // Use UTF8ToUnicode on all systems, since it replaces // invalid characters and Core Foundation doesn't do that. if (CodePage == CP_UTF8 || CodePage == CP_ACP) { - int inputLength = (int)strlen(lpMultiByteStr); - bool allowNulls = (cbMultiByte > 0 && lpMultiByteStr[cbMultiByte - 1] != '\0'); - bool subtractOne = cbMultiByte == cchWideChar || allowNulls; if (cbMultiByte <= -1) { - cbMultiByte = inputLength + 1; + cbMultiByte = strlen(lpMultiByteStr) + 1; } - size_t allocSize = 0; - struct cookie { LPWSTR str; size_t* allocSize; int* count; }; - cookie callbackCookie = { .str = lpWideCharStr, .allocSize = &allocSize, .count = &cchWideChar }; - - long itemsWritten; - GError *gerror = NULL; - lpWideCharStr = (LPWSTR)g_utf8_to_utf16_custom_alloc_optional(lpMultiByteStr, cbMultiByte, &retval, &itemsWritten, allowNulls, - !(dwFlags & MB_ERR_INVALID_CHARS), cbMultiByte > inputLength, - [](size_t req_size, void* custom_alloc_data) - { - cookie* callbackCookie = (cookie*)(custom_alloc_data); - *(callbackCookie->allocSize) = (req_size / sizeof (gunichar2)); - int count = *(callbackCookie->count); - return (void*)(callbackCookie->str && !(count && *(callbackCookie->allocSize) - 1 > (size_t)count) ? 
callbackCookie->str : NULL); - }, &callbackCookie, &gerror); - - if (gerror && (lpWideCharStr || (cchWideChar && allocSize > (size_t)cchWideChar))) - { - retval = 0; - ERROR ("The error is %d %s\n", gerror->code, gerror->message); - switch (gerror->code) - { - case G_CONVERT_ERROR_ILLEGAL_SEQUENCE: SetLastError(ERROR_NO_UNICODE_TRANSLATION); break; - case G_CONVERT_ERROR_NO_MEMORY: SetLastError(ERROR_INSUFFICIENT_BUFFER); break; - default: SetLastError(ERROR_INVALID_PARAMETER); break; - } - free(gerror); - goto EXIT; - } - - retval = allocSize; - if (retval > 1 && subtractOne) retval -= 1; - + retval = UTF8ToUnicode(lpMultiByteStr, cbMultiByte, lpWideCharStr, cchWideChar, dwFlags); goto EXIT; } @@ -310,7 +274,7 @@ MultiByteToWideChar( LOGEXIT("MultiByteToWideChar returns %d.\n",retval); PERF_EXIT(MultiByteToWideChar); - return (int)retval; + return retval; } @@ -333,7 +297,7 @@ WideCharToMultiByte( IN LPCSTR lpDefaultChar, OUT LPBOOL lpUsedDefaultChar) { - long retval = 0; + INT retval =0; char defaultChar = '?'; BOOL usedDefaultChar = FALSE; @@ -374,50 +338,15 @@ WideCharToMultiByte( defaultChar = *lpDefaultChar; } - // Use g_utf16_to_utf8_custom_alloc_with_nulls on all systems because we use - // g_utf8_to_utf16 in MultiByteToWideChar() on all systems. + // Use UnicodeToUTF8 on all systems because we use + // UTF8ToUnicode in MultiByteToWideChar() on all systems. 
if (CodePage == CP_UTF8 || CodePage == CP_ACP) { - int inputLength = (int)PAL_wcslen(lpWideCharStr); - bool allowNulls = (cchWideChar > 0 && lpWideCharStr[cchWideChar - 1] != '\0'); - bool subtractOne = cchWideChar == cbMultiByte || allowNulls; if (cchWideChar == -1) { - cchWideChar = inputLength + 1; + cchWideChar = PAL_wcslen(lpWideCharStr) + 1; } - - size_t allocSize = 0; - struct cookie { LPSTR str; size_t* allocSize; int* count; }; - cookie callbackCookie = { .str = lpMultiByteStr, .allocSize = &allocSize, .count = &cbMultiByte }; - - long itemsWritten; - GError *gerror = NULL; - lpMultiByteStr = g_utf16_to_utf8_custom_alloc_with_nulls((unsigned short*)lpWideCharStr, cchWideChar, &retval, &itemsWritten, allowNulls, cchWideChar > inputLength, - [](size_t req_size, void* custom_alloc_data) - { - cookie* callbackCookie = (cookie*)(custom_alloc_data); - *(callbackCookie->allocSize) = req_size; - int count = (size_t)*(callbackCookie->count); - return (void*)(callbackCookie->str && !(count && *(callbackCookie->allocSize) - 1 > (size_t)count) ? 
callbackCookie->str : NULL); - }, &callbackCookie, &gerror); - - if (gerror && (lpMultiByteStr || (cbMultiByte && allocSize > (size_t)cbMultiByte))) - { - retval = 0; - ERROR ("The error is %d %s\n", gerror->code, gerror->message); - switch (gerror->code) - { - case G_CONVERT_ERROR_ILLEGAL_SEQUENCE: SetLastError(ERROR_NO_UNICODE_TRANSLATION); break; - case G_CONVERT_ERROR_NO_MEMORY: SetLastError(ERROR_INSUFFICIENT_BUFFER); break; - default: SetLastError(ERROR_INVALID_PARAMETER); break; - } - free(gerror); - goto EXIT; - } - - retval = allocSize; - if (retval > 1 && subtractOne) retval -= 1; - + retval = UnicodeToUTF8(lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte); goto EXIT; } @@ -445,7 +374,7 @@ WideCharToMultiByte( LOGEXIT("WideCharToMultiByte returns INT %d\n", retval); PERF_EXIT(WideCharToMultiByte); - return (int)retval; + return retval; } extern char * g_szCoreCLRPath; diff --git a/src/coreclr/pal/src/locale/utf8.cpp b/src/coreclr/pal/src/locale/utf8.cpp new file mode 100644 index 00000000000000..f07c69ff7e15f3 --- /dev/null +++ b/src/coreclr/pal/src/locale/utf8.cpp @@ -0,0 +1,2937 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*++ + +Module Name: + + unicode/utf8.c + +Abstract: + Functions to encode and decode UTF-8 strings. This is a port of the C# version from Utf8Encoding.cs. 
+ +Revision History: + +--*/ + +#include "pal/utf8.h" +#include "pal/malloc.hpp" + +using namespace CorUnix; + +#define FASTLOOP + +struct CharUnicodeInfo +{ + static const WCHAR HIGH_SURROGATE_START = 0xd800; + static const WCHAR HIGH_SURROGATE_END = 0xdbff; + static const WCHAR LOW_SURROGATE_START = 0xdc00; + static const WCHAR LOW_SURROGATE_END = 0xdfff; +}; + +struct Char +{ + // Test if the wide character is a high surrogate + static bool IsHighSurrogate(const WCHAR c) + { + return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START; + } + + // Test if the wide character is a low surrogate + static bool IsLowSurrogate(const WCHAR c) + { + return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START; + } + + // Test if the wide character is a surrogate half + static bool IsSurrogate(const WCHAR c) + { + return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START; + } + + // Test if the wide character is a high surrogate + static bool IsHighSurrogate(const WCHAR* s, int index) + { + return IsHighSurrogate(s[index]); + } + + // Test if the wide character is a low surrogate + static bool IsLowSurrogate(const WCHAR* s, int index) + { + return IsLowSurrogate(s[index]); + } + + // Test if the wide character is a surrogate half + static bool IsSurrogate(const WCHAR* s, int index) + { + return IsSurrogate(s[index]); + } +}; + +class ArgumentException +{ + +public: + ArgumentException(LPCSTR message) + { + } + + ArgumentException(LPCSTR message, LPCSTR argName) + { + } +}; + +class ArgumentNullException : public ArgumentException +{ +public: + ArgumentNullException(LPCSTR argName) + : ArgumentException("Argument is NULL", argName) + { + + } +}; + +class ArgumentOutOfRangeException : public ArgumentException +{ +public: + ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message) + : ArgumentException(message, argName) + { + + } +}; + +class InsufficientBufferException : public ArgumentException +{ +public: + InsufficientBufferException(LPCSTR message, LPCSTR argName) 
+ : ArgumentException(message, argName) + { + + } +}; + +class Contract +{ +public: + static void Assert(bool cond, LPCSTR str) + { + if (!cond) + { + throw ArgumentException(str); + } + } + + static void EndContractBlock() + { + } +}; + +class DecoderFallbackException : public ArgumentException +{ + BYTE *bytesUnknown; + int index; + +public: + DecoderFallbackException( + LPCSTR message, BYTE bytesUnknown[], int index) : ArgumentException(message) + { + this->bytesUnknown = bytesUnknown; + this->index = index; + } + + BYTE *BytesUnknown() + { + return (bytesUnknown); + } + + int GetIndex() + { + return index; + } +}; + +class DecoderFallbackBuffer; + +class DecoderFallback +{ +public: + + // Fallback + // + // Return the appropriate unicode string alternative to the character that need to fall back. + + virtual DecoderFallbackBuffer* CreateFallbackBuffer() = 0; + + // Maximum number of characters that this instance of this fallback could return + + virtual int GetMaxCharCount() = 0; +}; + +class DecoderReplacementFallback : public DecoderFallback +{ + // Our variables + WCHAR strDefault[2]; + int strDefaultLength; + +public: + // Construction. Default replacement fallback uses no best fit and ? replacement string + DecoderReplacementFallback() : DecoderReplacementFallback(W("?")) + { + } + + DecoderReplacementFallback(const WCHAR* replacement) + { + // Must not be null + if (replacement == nullptr) + throw ArgumentNullException("replacement"); + Contract::EndContractBlock(); + + // Make sure it doesn't have bad surrogate pairs + bool bFoundHigh = false; + int replacementLength = PAL_wcslen((const WCHAR *)replacement); + for (int i = 0; i < replacementLength; i++) + { + // Found a surrogate? + if (Char::IsSurrogate(replacement, i)) + { + // High or Low? 
+ if (Char::IsHighSurrogate(replacement, i)) + { + // if already had a high one, stop + if (bFoundHigh) + break; // break & throw at the bFoundHIgh below + bFoundHigh = true; + } + else + { + // Low, did we have a high? + if (!bFoundHigh) + { + // Didn't have one, make if fail when we stop + bFoundHigh = true; + break; + } + + // Clear flag + bFoundHigh = false; + } + } + // If last was high we're in trouble (not surrogate so not low surrogate, so break) + else if (bFoundHigh) + break; + } + if (bFoundHigh) + throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement"); + + wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); + strDefaultLength = replacementLength; + } + + WCHAR* GetDefaultString() + { + return strDefault; + } + + virtual DecoderFallbackBuffer* CreateFallbackBuffer(); + + // Maximum number of characters that this instance of this fallback could return + virtual int GetMaxCharCount() + { + return strDefaultLength; + } +}; + +class DecoderFallbackBuffer +{ + friend class UTF8Encoding; + // Most implementations will probably need an implementation-specific constructor + + // internal methods that cannot be overridden that let us do our fallback thing + // These wrap the internal methods so that we can check for people doing stuff that's incorrect + +public: + virtual ~DecoderFallbackBuffer() = default; + + virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = 0; + + // Get next character + virtual WCHAR GetNextChar() = 0; + + //Back up a character + virtual bool MovePrevious() = 0; + + // How many chars left in this fallback? + virtual int GetRemaining() = 0; + + // Clear the buffer + virtual void Reset() + { + while (GetNextChar() != (WCHAR)0); + } + + // Internal items to help us figure out what we're doing as far as error messages, etc. 
+ // These help us with our performance and messages internally +protected: + BYTE* byteStart; + WCHAR* charEnd; + + // Internal reset + void InternalReset() + { + byteStart = nullptr; + Reset(); + } + + // Set the above values + // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. + void InternalInitialize(BYTE* byteStart, WCHAR* charEnd) + { + this->byteStart = byteStart; + this->charEnd = charEnd; + } + + // Fallback the current byte by sticking it into the remaining char buffer. + // This can only be called by our encodings (other have to use the public fallback methods), so + // we can use our DecoderNLS here too (except we don't). + // Returns true if we are successful, false if we can't fallback the character (no buffer space) + // So caller needs to throw buffer space if return false. + // Right now this has both bytes and bytes[], since we might have extra bytes, hence the + // array, and we might need the index, hence the byte* + // Don't touch ref chars unless we succeed + virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size) + { + + Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize"); + + // See if there's a fallback character and we have an output buffer then copy our string. 
+ if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) + { + // Copy the chars to our output + WCHAR ch; + WCHAR* charTemp = *chars; + bool bHighSurrogate = false; + while ((ch = GetNextChar()) != 0) + { + // Make sure no mixed up surrogates + if (Char::IsSurrogate(ch)) + { + if (Char::IsHighSurrogate(ch)) + { + // High Surrogate + if (bHighSurrogate) + throw ArgumentException("String 'chars' contains invalid Unicode code points."); + bHighSurrogate = true; + } + else + { + // Low surrogate + if (!bHighSurrogate) + throw ArgumentException("String 'chars' contains invalid Unicode code points."); + bHighSurrogate = false; + } + } + + if (charTemp >= charEnd) + { + // No buffer space + return false; + } + + *(charTemp++) = ch; + } + + // Need to make sure that bHighSurrogate isn't true + if (bHighSurrogate) + throw ArgumentException("String 'chars' contains invalid Unicode code points."); + + // Now we aren't going to be false, so its OK to update chars + *chars = charTemp; + } + + return true; + } + + // This version just counts the fallback and doesn't actually copy anything. + virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) + // Right now this has both bytes[] and BYTE* bytes, since we might have extra bytes, hence the + // array, and we might need the index, hence the byte* + { + + Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize"); + + // See if there's a fallback character and we have an output buffer then copy our string. 
+ if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) + { + int count = 0; + + WCHAR ch; + bool bHighSurrogate = false; + while ((ch = GetNextChar()) != 0) + { + // Make sure no mixed up surrogates + if (Char::IsSurrogate(ch)) + { + if (Char::IsHighSurrogate(ch)) + { + // High Surrogate + if (bHighSurrogate) + throw ArgumentException("String 'chars' contains invalid Unicode code points."); + bHighSurrogate = true; + } + else + { + // Low surrogate + if (!bHighSurrogate) + throw ArgumentException("String 'chars' contains invalid Unicode code points."); + bHighSurrogate = false; + } + } + + count++; + } + + // Need to make sure that bHighSurrogate isn't true + if (bHighSurrogate) + throw ArgumentException("String 'chars' contains invalid Unicode code points."); + + return count; + } + + // If no fallback return 0 + return 0; + } + + // private helper methods + void ThrowLastBytesRecursive(BYTE bytesUnknown[]) + { + throw ArgumentException("Recursive fallback not allowed"); + } +}; + +class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer +{ + // Store our default string + WCHAR strDefault[2]; + int strDefaultLength; + int fallbackCount = -1; + int fallbackIndex = -1; + +public: + // Construction + DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback) + { + wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); + strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); + } + + // Fallback Methods + virtual bool Fallback(BYTE bytesUnknown[], int index, int size) + { + // We expect no previous fallback in our buffer + // We can't call recursively but others might (note, we don't test on last char!!!) 
+ if (fallbackCount >= 1) + { + ThrowLastBytesRecursive(bytesUnknown); + } + + // Go ahead and get our fallback + if (strDefaultLength == 0) + return false; + + fallbackCount = strDefaultLength; + fallbackIndex = -1; + + return true; + } + + virtual WCHAR GetNextChar() + { + // We want it to get < 0 because == 0 means that the current/last character is a fallback + // and we need to detect recursion. We could have a flag but we already have this counter. + fallbackCount--; + fallbackIndex++; + + // Do we have anything left? 0 is now last fallback char, negative is nothing left + if (fallbackCount < 0) + return '\0'; + + // Need to get it out of the buffer. + // Make sure it didn't wrap from the fast count-- path + if (fallbackCount == INT_MAX) + { + fallbackCount = -1; + return '\0'; + } + + // Now make sure its in the expected range + Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, + "Index exceeds buffer range"); + + return strDefault[fallbackIndex]; + } + + virtual bool MovePrevious() + { + // Back up one, only if we just processed the last character (or earlier) + if (fallbackCount >= -1 && fallbackIndex >= 0) + { + fallbackIndex--; + fallbackCount++; + return true; + } + + // Return false 'cause we couldn't do it. + return false; + } + + // How many characters left to output? + virtual int GetRemaining() + { + // Our count is 0 for 1 character left. + return (fallbackCount < 0) ? 0 : fallbackCount; + } + + // Clear the buffer + virtual void Reset() + { + fallbackCount = -1; + fallbackIndex = -1; + byteStart = nullptr; + } + + // This version just counts the fallback and doesn't actually copy anything. 
+ virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) + // Right now this has both bytes and bytes[], since we might have extra bytes, hence the + // array, and we might need the index, hence the byte* + { + // return our replacement string Length + return strDefaultLength; + } +}; + +class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer +{ +public: + DecoderExceptionFallbackBuffer() + { + } + + virtual bool Fallback(BYTE bytesUnknown[], int index, int size) + { + throw DecoderFallbackException( + "Unable to translate UTF-8 character to Unicode", bytesUnknown, index); + } + + virtual WCHAR GetNextChar() + { + return 0; + } + + virtual bool MovePrevious() + { + // Exception fallback doesn't have anywhere to back up to. + return false; + } + + // Exceptions are always empty + virtual int GetRemaining() + { + return 0; + } + +}; + +class DecoderExceptionFallback : public DecoderFallback +{ + // Construction +public: + DecoderExceptionFallback() + { + } + + virtual DecoderFallbackBuffer* CreateFallbackBuffer() + { + return InternalNew(); + } + + // Maximum number of characters that this instance of this fallback could return + virtual int GetMaxCharCount() + { + return 0; + } +}; + +DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer() +{ + return InternalNew(this); +} + +class EncoderFallbackException : public ArgumentException +{ + WCHAR charUnknown; + WCHAR charUnknownHigh; + WCHAR charUnknownLow; + int index; + +public: + EncoderFallbackException( + LPCSTR message, WCHAR charUnknown, int index) : ArgumentException(message) + { + this->charUnknown = charUnknown; + this->index = index; + } + + EncoderFallbackException( + LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException(message) + { + if (!Char::IsHighSurrogate(charUnknownHigh)) + { + throw ArgumentOutOfRangeException("charUnknownHigh", + "Argument out of range 0xD800..0xDBFF"); + } + if (!Char::IsLowSurrogate(charUnknownLow)) 
+ { + throw ArgumentOutOfRangeException("charUnknownLow", + "Argument out of range 0xDC00..0xDFFF"); + } + Contract::EndContractBlock(); + + this->charUnknownHigh = charUnknownHigh; + this->charUnknownLow = charUnknownLow; + this->index = index; + } + + WCHAR GetCharUnknown() + { + return (charUnknown); + } + + WCHAR GetCharUnknownHigh() + { + return (charUnknownHigh); + } + + WCHAR GetCharUnknownLow() + { + return (charUnknownLow); + } + + int GetIndex() + { + return index; + } + + // Return true if the unknown character is a surrogate pair. + bool IsUnknownSurrogate() + { + return (charUnknownHigh != '\0'); + } +}; + +class EncoderFallbackBuffer; + +class EncoderFallback +{ +public: + + // Fallback + // + // Return the appropriate unicode string alternative to the character that need to fall back. + + virtual EncoderFallbackBuffer* CreateFallbackBuffer() = 0; + + // Maximum number of characters that this instance of this fallback could return + virtual int GetMaxCharCount() = 0; +}; + +class EncoderReplacementFallback : public EncoderFallback +{ + // Our variables + WCHAR strDefault[2]; + int strDefaultLength; + +public: + // Construction. Default replacement fallback uses no best fit and ? replacement string + EncoderReplacementFallback() : EncoderReplacementFallback(W("?")) + { + } + + EncoderReplacementFallback(const WCHAR* replacement) + { + // Must not be null + if (replacement == nullptr) + throw ArgumentNullException("replacement"); + Contract::EndContractBlock(); + + // Make sure it doesn't have bad surrogate pairs + bool bFoundHigh = false; + int replacementLength = PAL_wcslen((const WCHAR *)replacement); + for (int i = 0; i < replacementLength; i++) + { + // Found a surrogate? + if (Char::IsSurrogate(replacement, i)) + { + // High or Low? 
+ if (Char::IsHighSurrogate(replacement, i)) + { + // if already had a high one, stop + if (bFoundHigh) + break; // break & throw at the bFoundHIgh below + bFoundHigh = true; + } + else + { + // Low, did we have a high? + if (!bFoundHigh) + { + // Didn't have one, make if fail when we stop + bFoundHigh = true; + break; + } + + // Clear flag + bFoundHigh = false; + } + } + // If last was high we're in trouble (not surrogate so not low surrogate, so break) + else if (bFoundHigh) + break; + } + if (bFoundHigh) + throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement"); + + wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); + strDefaultLength = replacementLength; + } + + WCHAR* GetDefaultString() + { + return strDefault; + } + + virtual EncoderFallbackBuffer* CreateFallbackBuffer(); + + // Maximum number of characters that this instance of this fallback could return + virtual int GetMaxCharCount() + { + return strDefaultLength; + } +}; + +class EncoderFallbackBuffer +{ + friend class UTF8Encoding; + // Most implementations will probably need an implementation-specific constructor + + // Public methods that cannot be overridden that let us do our fallback thing + // These wrap the internal methods so that we can check for people doing stuff that is incorrect + +public: + virtual ~EncoderFallbackBuffer() = default; + + virtual bool Fallback(WCHAR charUnknown, int index) = 0; + + virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = 0; + + // Get next character + virtual WCHAR GetNextChar() = 0; + + // Back up a character + virtual bool MovePrevious() = 0; + + // How many chars left in this fallback? + virtual int GetRemaining() = 0; + + // Not sure if this should be public or not. + // Clear the buffer + virtual void Reset() + { + while (GetNextChar() != (WCHAR)0); + } + + // Internal items to help us figure out what we're doing as far as error messages, etc. 
+ // These help us with our performance and messages internally +protected: + WCHAR* charStart; + WCHAR* charEnd; + bool setEncoder; + bool bUsedEncoder; + bool bFallingBack = false; + int iRecursionCount = 0; + static const int iMaxRecursion = 250; + + // Internal Reset + // For example, what if someone fails a conversion and wants to reset one of our fallback buffers? + void InternalReset() + { + charStart = nullptr; + bFallingBack = false; + iRecursionCount = 0; + Reset(); + } + + // Set the above values + // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. + void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder) + { + this->charStart = charStart; + this->charEnd = charEnd; + this->setEncoder = setEncoder; + this->bUsedEncoder = false; + this->bFallingBack = false; + this->iRecursionCount = 0; + } + + WCHAR InternalGetNextChar() + { + WCHAR ch = GetNextChar(); + bFallingBack = (ch != 0); + if (ch == 0) iRecursionCount = 0; + return ch; + } + + // Fallback the current character using the remaining buffer and encoder if necessary + // This can only be called by our encodings (other have to use the public fallback methods), so + // we can use our EncoderNLS here too. + // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount + // + // Note that this could also change the contents of this->encoder, which is the same + // object that the caller is using, so the caller could mess up the encoder for us + // if they aren't careful. 
+ virtual bool InternalFallback(WCHAR ch, WCHAR** chars) + { + // Shouldn't have null charStart + Contract::Assert(charStart != nullptr, + "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized"); + + // Get our index, remember chars was preincremented to point at next char, so have to -1 + int index = (int)(*chars - charStart) - 1; + + // See if it was a high surrogate + if (Char::IsHighSurrogate(ch)) + { + // See if there's a low surrogate to go with it + if (*chars >= this->charEnd) + { + // Nothing left in input buffer + // No input, return 0 + } + else + { + // Might have a low surrogate + WCHAR cNext = **chars; + if (Char::IsLowSurrogate(cNext)) + { + // If already falling back then fail + if (bFallingBack && iRecursionCount++ > iMaxRecursion) + ThrowLastCharRecursive(ch, cNext); + + // Next is a surrogate, add it as surrogate pair, and increment chars + (*chars)++; + bFallingBack = Fallback(ch, cNext, index); + return bFallingBack; + } + + // Next isn't a low surrogate, just fallback the high surrogate + } + } + + // If already falling back then fail + if (bFallingBack && iRecursionCount++ > iMaxRecursion) + ThrowLastCharRecursive((int)ch); + + // Fall back our char + bFallingBack = Fallback(ch, index); + + return bFallingBack; + } + + // private helper methods + void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate) + { + // Throw it, using our complete character + throw ArgumentException("Recursive fallback not allowed", "chars"); + } + + void ThrowLastCharRecursive(int utf32Char) + { + throw ArgumentException("Recursive fallback not allowed", "chars"); + } + +}; + +class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer +{ + // Store our default string + WCHAR strDefault[4]; + int strDefaultLength; + int fallbackCount = -1; + int fallbackIndex = -1; +public: + // Construction + EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback) + { + // 2X in case we're a surrogate pair + 
wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); + wcscat_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); + strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); + + } + + // Fallback Methods + virtual bool Fallback(WCHAR charUnknown, int index) + { + // If we had a buffer already we're being recursive, throw, it's probably at the suspect + // character in our array. + if (fallbackCount >= 1) + { + // If we're recursive we may still have something in our buffer that makes this a surrogate + if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= 0 && + Char::IsLowSurrogate(strDefault[fallbackIndex + 1])) + ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + 1]); + + // Nope, just one character + ThrowLastCharRecursive((int)charUnknown); + } + + // Go ahead and get our fallback + // Divide by 2 because we aren't a surrogate pair + fallbackCount = strDefaultLength / 2; + fallbackIndex = -1; + + return fallbackCount != 0; + } + + virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) + { + // Double check input surrogate pair + if (!Char::IsHighSurrogate(charUnknownHigh)) + throw ArgumentOutOfRangeException("charUnknownHigh", + "Argument out of range 0xD800..0xDBFF"); + + if (!Char::IsLowSurrogate(charUnknownLow)) + throw ArgumentOutOfRangeException("charUnknownLow", + "Argument out of range 0xDC00..0xDFFF"); + Contract::EndContractBlock(); + + // If we had a buffer already we're being recursive, throw, it's probably at the suspect + // character in our array. + if (fallbackCount >= 1) + ThrowLastCharRecursive(charUnknownHigh, charUnknownLow); + + // Go ahead and get our fallback + fallbackCount = strDefaultLength; + fallbackIndex = -1; + + return fallbackCount != 0; + } + + virtual WCHAR GetNextChar() + { + // We want it to get < 0 because == 0 means that the current/last character is a fallback + // and we need to detect recursion. 
We could have a flag but we already have this counter. + fallbackCount--; + fallbackIndex++; + + // Do we have anything left? 0 is now last fallback char, negative is nothing left + if (fallbackCount < 0) + return '\0'; + + // Need to get it out of the buffer. + // Make sure it didn't wrap from the fast count-- path + if (fallbackCount == INT_MAX) + { + fallbackCount = -1; + return '\0'; + } + + // Now make sure its in the expected range + Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, + "Index exceeds buffer range"); + + return strDefault[fallbackIndex]; + } + + virtual bool MovePrevious() + { + // Back up one, only if we just processed the last character (or earlier) + if (fallbackCount >= -1 && fallbackIndex >= 0) + { + fallbackIndex--; + fallbackCount++; + return true; + } + + // Return false 'cause we couldn't do it. + return false; + } + + // How many characters left to output? + virtual int GetRemaining() + { + // Our count is 0 for 1 character left. + return (fallbackCount < 0) ? 
0 : fallbackCount; + } + + // Clear the buffer + virtual void Reset() + { + fallbackCount = -1; + fallbackIndex = 0; + charStart = nullptr; + bFallingBack = false; + } +}; + +class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer +{ +public: + EncoderExceptionFallbackBuffer() + { + } + + virtual bool Fallback(WCHAR charUnknown, int index) + { + // Fall back our char + throw EncoderFallbackException("Unable to translate Unicode character to UTF-8", charUnknown, index); + } + + virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) + { + if (!Char::IsHighSurrogate(charUnknownHigh)) + { + throw ArgumentOutOfRangeException("charUnknownHigh", + "Argument out of range 0xD800..0xDBFF"); + } + if (!Char::IsLowSurrogate(charUnknownLow)) + { + throw ArgumentOutOfRangeException("charUnknownLow", + "Argument out of range 0xDC00..0xDFFF"); + } + Contract::EndContractBlock(); + + //int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow); + + // Fall back our char + throw EncoderFallbackException( + "Unable to translate Unicode character to UTF-8", charUnknownHigh, charUnknownLow, index); + } + + virtual WCHAR GetNextChar() + { + return 0; + } + + virtual bool MovePrevious() + { + // Exception fallback doesn't have anywhere to back up to. + return false; + } + + // Exceptions are always empty + virtual int GetRemaining() + { + return 0; + } +}; + +class EncoderExceptionFallback : public EncoderFallback +{ + // Construction +public: + EncoderExceptionFallback() + { + } + + virtual EncoderFallbackBuffer* CreateFallbackBuffer() + { + return InternalNew(); + } + + // Maximum number of characters that this instance of this fallback could return + virtual int GetMaxCharCount() + { + return 0; + } +}; + +EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer() +{ + return InternalNew(this); +} + +class UTF8Encoding +{ + EncoderFallback* encoderFallback; + // Instances of the two possible fallbacks. 
The constructor parameter + // determines which one to use. + EncoderReplacementFallback encoderReplacementFallback; + EncoderExceptionFallback encoderExceptionFallback; + + DecoderFallback* decoderFallback; + // Instances of the two possible fallbacks. The constructor parameter + // determines which one to use. + DecoderReplacementFallback decoderReplacementFallback; + DecoderExceptionFallback decoderExceptionFallback; + + bool InRange(int c, int begin, int end) + { + return begin <= c && c <= end; + } + + size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2) + { + return ptr1 - ptr2; + } + + size_t PtrDiff(BYTE* ptr1, BYTE* ptr2) + { + return ptr1 - ptr2; + } + + void ThrowBytesOverflow() + { + // Special message to include fallback type in case fallback's GetMaxCharCount is broken + // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount + throw InsufficientBufferException("The output byte buffer is too small to contain the encoded data", "bytes"); + } + + void ThrowBytesOverflow(bool nothingEncoded) + { + // Special message to include fallback type in case fallback's GetMaxCharCount is broken + // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount + if (nothingEncoded){ + ThrowBytesOverflow(); + } + } + + void ThrowCharsOverflow() + { + // Special message to include fallback type in case fallback's GetMaxCharCount is broken + // This happens if user has implemented a decoder fallback with a broken GetMaxCharCount + throw InsufficientBufferException("The output char buffer is too small to contain the encoded data", "chars"); + } + + void ThrowCharsOverflow(bool nothingEncoded) + { + // Special message to include fallback type in case fallback's GetMaxCharCount is broken + // This happens if user has implemented an decoder fallback with a broken GetMaxCharCount + if (nothingEncoded){ + ThrowCharsOverflow(); + } + } + + // During GetChars we had an invalid byte sequence + // pSrc is backed up to the start of 
the bad sequence if we didn't have room to + // fall it back. Otherwise pSrc remains where it is. + bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget) + { + // Get our byte[] + BYTE* pStart = *pSrc; + BYTE bytesUnknown[3]; + int size = GetBytesUnknown(pStart, ch, bytesUnknown); + + // Do the actual fallback + if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size)) + { + // Oops, it failed, back up to pStart + *pSrc = pStart; + return false; + } + + // It worked + return true; + } + + int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback) + { + // Get our byte[] + BYTE bytesUnknown[3]; + int size = GetBytesUnknown(pSrc, ch, bytesUnknown); + + // Do the actual fallback + int count = fallback->InternalFallback(bytesUnknown, pSrc, size); + + // # of fallback chars expected. + // Note that we only get here for "long" sequences, and have already unreserved + // the count that we prereserved for the input bytes + return count; + } + + int GetBytesUnknown(BYTE* pSrc, int ch, BYTE* bytesUnknown) + { + int size; + + // See if it was a plain char + // (have to check >= 0 because we have all sorts of weird bit flags) + if (ch < 0x100 && ch >= 0) + { + pSrc--; + bytesUnknown[0] = (BYTE)ch; + size = 1; + } + // See if its an unfinished 2 byte sequence + else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0) + { + pSrc--; + bytesUnknown[0] = (BYTE)((ch & 0x1F) | 0xc0); + size = 1; + } + // So now we're either 2nd byte of 3 or 4 byte sequence or + // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence + // 1st check if its a 4 byte sequence + else if ((ch & SupplimentarySeq) != 0) + { + // 3rd byte of 4 byte sequence? 
+ if ((ch & (FinalByte >> 6)) != 0) + { + // 3rd byte of 4 byte sequence + pSrc -= 3; + bytesUnknown[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0); + bytesUnknown[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80); + bytesUnknown[2] = (BYTE)(((ch)& 0x3F) | 0x80); + size = 3; + } + else if ((ch & (FinalByte >> 12)) != 0) + { + // 2nd byte of a 4 byte sequence + pSrc -= 2; + bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0); + bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80); + size = 2; + } + else + { + // 4th byte of a 4 byte sequence + pSrc--; + bytesUnknown[0] = (BYTE)(((ch)& 0x07) | 0xF0); + size = 1; + } + } + else + { + // 2nd byte of 3 byte sequence? + if ((ch & (FinalByte >> 6)) != 0) + { + // So its 2nd byte of a 3 byte sequence + pSrc -= 2; + bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0); + bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80); + size = 2; + } + else + { + // 1st byte of a 3 byte sequence + pSrc--; + bytesUnknown[0] = (BYTE)(((ch)& 0x0F) | 0xE0); + size = 1; + } + } + + return size; + } + +public: + + UTF8Encoding(bool isThrowException) + : encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD")) + { + if (isThrowException) + { + encoderFallback = &encoderExceptionFallback; + decoderFallback = &decoderExceptionFallback; + } + else + { + encoderFallback = &encoderReplacementFallback; + decoderFallback = &decoderReplacementFallback; + } + } + + // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits + // while the actual character is being built in the lower bits. They are shifted together + // with the actual bits of the character. 
+ + // bits 30 & 31 are used for pending bits fixup + const int FinalByte = 1 << 29; + const int SupplimentarySeq = 1 << 28; + const int ThreeByteSeq = 1 << 27; + + int GetCharCount(BYTE* bytes, int count) + { + Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr"); + Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0"); + + // Initialize stuff + BYTE *pSrc = bytes; + BYTE *pEnd = pSrc + count; + + // Start by assuming we have as many as count, charCount always includes the adjustment + // for the character being decoded + int charCount = count; + int ch = 0; + DecoderFallbackBuffer *fallback = nullptr; + + while (true) + { + // SLOWLOOP: does all range checks, handles all special cases, but it is slow + if (pSrc >= pEnd) { + break; + } + + // read next byte. The JIT optimization seems to be getting confused when + // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead + int cha = *pSrc; + + if (ch == 0) { + // no pending bits + goto ReadChar; + } + + pSrc++; + + // we are expecting to see trailing bytes like 10vvvvvv + if ((cha & 0xC0) != 0x80) { + // This can be a valid starting byte for another UTF8 byte sequence, so let's put + // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence + pSrc--; + charCount += (ch >> 30); + goto InvalidByteSequence; + } + + // fold in the new byte + ch = (ch << 6) | (cha & 0x3F); + + if ((ch & FinalByte) == 0) { + Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, + "[UTF8Encoding.GetChars]Invariant volation"); + + if ((ch & SupplimentarySeq) != 0) { + if ((ch & (FinalByte >> 6)) != 0) { + // this is 3rd byte (of 4 byte supplimentary) - nothing to do + continue; + } + + // 2nd byte, check for non-shortest form of supplimentary char and the valid + // supplimentary characters in range 0x010000 - 0x10FFFF at the same time + if (!InRange(ch & 0x1F0, 0x10, 0x100)) { + goto InvalidByteSequence; + } + } + else { + // Must be 
2nd byte of a 3-byte sequence + // check for non-shortest form of 3 byte seq + if ((ch & (0x1F << 5)) == 0 || // non-shortest form + (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate + { + goto InvalidByteSequence; + } + } + continue; + } + + // ready to punch + + // adjust for surrogates in non-shortest form + if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) { + charCount--; + } + goto EncodeChar; + + InvalidByteSequence: + // this code fragment should be close to the gotos referencing it + // Have to do fallback for invalid bytes + if (fallback == nullptr) + { + fallback = decoderFallback->CreateFallbackBuffer(); + fallback->InternalInitialize(bytes, nullptr); + } + charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); + + ch = 0; + continue; + + ReadChar: + ch = *pSrc; + pSrc++; + + ProcessChar: + if (ch > 0x7F) { + // If its > 0x7F, its start of a new multi-byte sequence + + // Long sequence, so unreserve our char. + charCount--; + + // bit 6 has to be non-zero for start of multibyte chars. + if ((ch & 0x40) == 0) { + // Unexpected trail byte + goto InvalidByteSequence; + } + + // start a new long code + if ((ch & 0x20) != 0) { + if ((ch & 0x10) != 0) { + // 4 byte encoding - supplimentary character (2 surrogates) + + ch &= 0x0F; + + // check that bit 4 is zero and the valid supplimentary character + // range 0x000000 - 0x10FFFF at the same time + if (ch > 0x04) { + ch |= 0xf0; + goto InvalidByteSequence; + } + + // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. + // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. 
+ ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now + (1 << 30) | // If it dies on next byte we'll need an extra char + (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char + (SupplimentarySeq) | (SupplimentarySeq >> 6) | + (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); + + // Our character count will be 2 characters for these 4 bytes, so subtract another char + charCount--; + } + else { + // 3 byte encoding + // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. + ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | + (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); + + // We'll expect 1 character for these 3 bytes, so subtract another char. + charCount--; + } + } + else { + // 2 byte encoding + + ch &= 0x1F; + + // check for non-shortest form + if (ch <= 1) { + ch |= 0xc0; + goto InvalidByteSequence; + } + + // Add bit flags so we'll be flagged correctly + ch |= (FinalByte >> 6); + } + continue; + } + + EncodeChar: + +#ifdef FASTLOOP + int availableBytes = PtrDiff(pEnd, pSrc); + + // don't fall into the fast decoding loop if we don't have enough bytes + if (availableBytes <= 13) { + // try to get over the remainder of the ascii characters fast though + BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + while (pSrc < pLocalEnd) { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) + goto ProcessChar; + } + // we are done + ch = 0; + break; + } + + // To compute the upper bound, assume that all characters are ASCII characters at this point, + // the boundary will be decreased for every non-ASCII character we encounter + // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences + BYTE *pStop = pSrc + availableBytes - 7; + + while (pSrc < pStop) { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) { + goto LongCode; + } + + // get pSrc 2-byte aligned + if (((size_t)pSrc & 0x1) != 0) { + ch = *pSrc; + pSrc++; + 
if (ch > 0x7F) { + goto LongCode; + } + } + + // get pSrc 4-byte aligned + if (((size_t)pSrc & 0x2) != 0) { + ch = *(USHORT*)pSrc; + if ((ch & 0x8080) != 0) { + goto LongCodeWithMask16; + } + pSrc += 2; + } + + + // Run 8 + 8 characters at a time! + while (pSrc < pStop) { + ch = *(int*)pSrc; + int chb = *(int*)(pSrc + 4); + if (((ch | chb) & (int)0x80808080) != 0) { + goto LongCodeWithMask32; + } + pSrc += 8; + + // This is a really small loop - unroll it + if (pSrc >= pStop) + break; + + ch = *(int*)pSrc; + chb = *(int*)(pSrc + 4); + if (((ch | chb) & (int)0x80808080) != 0) { + goto LongCodeWithMask32; + } + pSrc += 8; + } + break; + +#if BIGENDIAN + LongCodeWithMask32 : + // be careful about the sign extension + ch = (int)(((uint)ch) >> 16); + LongCodeWithMask16: + ch = (int)(((uint)ch) >> 8); +#else // BIGENDIAN + LongCodeWithMask32: + LongCodeWithMask16: + ch &= 0xFF; +#endif // BIGENDIAN + pSrc++; + if (ch <= 0x7F) { + continue; + } + + LongCode: + int chc = *pSrc; + pSrc++; + + if ( + // bit 6 has to be zero + (ch & 0x40) == 0 || + // we are expecting to see trailing bytes like 10vvvvvv + (chc & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc &= 0x3F; + + // start a new long code + if ((ch & 0x20) != 0) { + + // fold the first two bytes together + chc |= (ch & 0x0F) << 6; + + if ((ch & 0x10) != 0) { + // 4 byte encoding - surrogate + ch = *pSrc; + if ( + // check that bit 4 is zero, the non-shortest form of surrogate + // and the valid surrogate range 0x000000 - 0x10FFFF at the same time + !InRange(chc >> 4, 0x01, 0x10) || + // we are expecting to see trailing bytes like 10vvvvvv + (ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc = (chc << 6) | (ch & 0x3F); + + ch = *(pSrc + 1); + // we are expecting to see trailing bytes like 10vvvvvv + if ((ch & 0xC0) != 0x80) { + goto BadLongCode; + } + pSrc += 2; + + // extra byte + charCount--; + } + else { + // 3 byte encoding + ch = *pSrc; + if ( + // check for non-shortest form of 3 byte seq + (chc & (0x1F 
<< 5)) == 0 || + // Can't have surrogates here. + (chc & (0xF800 >> 6)) == (0xD800 >> 6) || + // we are expecting to see trailing bytes like 10vvvvvv + (ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + pSrc++; + + // extra byte + charCount--; + } + } + else { + // 2 byte encoding + + // check for non-shortest form + if ((ch & 0x1E) == 0) { + goto BadLongCode; + } + } + + // extra byte + charCount--; + } +#endif // FASTLOOP + + // no pending bits at this point + ch = 0; + continue; + + BadLongCode: + pSrc -= 2; + ch = 0; + continue; + } + + // May have a problem if we have to flush + if (ch != 0) + { + // We were already adjusting for these, so need to unadjust + charCount += (ch >> 30); + // Have to do fallback for invalid bytes + if (fallback == nullptr) + { + fallback = decoderFallback->CreateFallbackBuffer(); + fallback->InternalInitialize(bytes, nullptr); + } + charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); + } + + // Shouldn't have anything in fallback buffer for GetCharCount + // (don't have to check m_throwOnOverflow for count) + Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, + "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end"); + + InternalDelete(fallback); + + return charCount; + + } + + int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount) + { + Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr"); + Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0"); + Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0"); + Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr"); + + BYTE *pSrc = bytes; + WCHAR *pTarget = chars; + + BYTE *pEnd = pSrc + byteCount; + WCHAR *pAllocatedBufferEnd = pTarget + charCount; + + int ch = 0; + + DecoderFallbackBuffer *fallback = nullptr; + + while (true) + { + // SLOWLOOP: does all range checks, handles all special cases, but it is slow + + if (pSrc >= pEnd) { + break; + } + + 
// read next byte. The JIT optimization seems to be getting confused when + // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead + int cha = *pSrc; + + if (ch == 0) { + // no pending bits + goto ReadChar; + } + + pSrc++; + + // we are expecting to see trailing bytes like 10vvvvvv + if ((cha & 0xC0) != 0x80) { + // This can be a valid starting byte for another UTF8 byte sequence, so let's put + // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence + pSrc--; + goto InvalidByteSequence; + } + + // fold in the new byte + ch = (ch << 6) | (cha & 0x3F); + + if ((ch & FinalByte) == 0) { + // Not at last byte yet + Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, + "[UTF8Encoding.GetChars]Invariant volation"); + + if ((ch & SupplimentarySeq) != 0) { + // Its a 4-byte supplimentary sequence + if ((ch & (FinalByte >> 6)) != 0) { + // this is 3rd byte of 4 byte sequence - nothing to do + continue; + } + + // 2nd byte of 4 bytes + // check for non-shortest form of surrogate and the valid surrogate + // range 0x000000 - 0x10FFFF at the same time + if (!InRange(ch & 0x1F0, 0x10, 0x100)) { + goto InvalidByteSequence; + } + } + else { + // Must be 2nd byte of a 3-byte sequence + // check for non-shortest form of 3 byte seq + if ((ch & (0x1F << 5)) == 0 || // non-shortest form + (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate + { + goto InvalidByteSequence; + } + } + continue; + } + + // ready to punch + + // surrogate in shortest form? + // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? 
+ if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) { + // let the range check for the second char throw the exception + if (pTarget < pAllocatedBufferEnd) { + *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + + (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)))); + pTarget++; + + ch = (ch & 0x3FF) + + (int)(CharUnicodeInfo::LOW_SURROGATE_START); + } + } + + goto EncodeChar; + + InvalidByteSequence: + // this code fragment should be close to the gotos referencing it + // Have to do fallback for invalid bytes + if (fallback == nullptr) + { + fallback = decoderFallback->CreateFallbackBuffer(); + fallback->InternalInitialize(bytes, pAllocatedBufferEnd); + } + + // That'll back us up the appropriate # of bytes if we didn't get anywhere + if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget)) + { + // Ran out of buffer space + // Need to throw an exception? + Contract::Assert(pSrc >= bytes || pTarget == chars, + "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback"); + fallback->InternalReset(); + ThrowCharsOverflow(pTarget == chars); + ch = 0; + break; + } + Contract::Assert(pSrc >= bytes, + "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array"); + ch = 0; + continue; + + ReadChar: + ch = *pSrc; + pSrc++; + + ProcessChar: + if (ch > 0x7F) { + // If its > 0x7F, its start of a new multi-byte sequence + + // bit 6 has to be non-zero + if ((ch & 0x40) == 0) { + goto InvalidByteSequence; + } + + // start a new long code + if ((ch & 0x20) != 0) { + if ((ch & 0x10) != 0) { + // 4 byte encoding - supplimentary character (2 surrogates) + + ch &= 0x0F; + + // check that bit 4 is zero and the valid supplimentary character + // range 0x000000 - 0x10FFFF at the same time + if (ch > 0x04) { + ch |= 0xf0; + goto InvalidByteSequence; + } + + ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | + (SupplimentarySeq) | (SupplimentarySeq >> 6) | + (SupplimentarySeq >> 2 * 6) | 
(SupplimentarySeq >> 3 * 6); + } + else { + // 3 byte encoding + ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | + (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); + } + } + else { + // 2 byte encoding + + ch &= 0x1F; + + // check for non-shortest form + if (ch <= 1) { + ch |= 0xc0; + goto InvalidByteSequence; + } + + ch |= (FinalByte >> 6); + } + continue; + } + + EncodeChar: + // write the pending character + if (pTarget >= pAllocatedBufferEnd) + { + // Fix chars so we make sure to throw if we didn't output anything + ch &= 0x1fffff; + if (ch > 0x7f) + { + if (ch > 0x7ff) + { + if (ch >= CharUnicodeInfo::LOW_SURROGATE_START && + ch <= CharUnicodeInfo::LOW_SURROGATE_END) + { + pSrc--; // It was 4 bytes + pTarget--; // 1 was stored already, but we can't remember 1/2, so back up + } + else if (ch > 0xffff) + { + pSrc--; // It was 4 bytes, nothing was stored + } + pSrc--; // It was at least 3 bytes + } + pSrc--; // It was at least 2 bytes + } + pSrc--; + + // Throw that we don't have enough room (pSrc could be < chars if we had started to process + // a 4 byte sequence already) + Contract::Assert(pSrc >= bytes || pTarget == chars, + "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]"); + ThrowCharsOverflow(pTarget == chars); + + // Don't store ch in decoder, we already backed up to its start + ch = 0; + + // Didn't throw, just use this buffer size. + break; + } + *pTarget = (WCHAR)ch; + pTarget++; + +#ifdef FASTLOOP + int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget); + int availableBytes = PtrDiff(pEnd, pSrc); + + // don't fall into the fast decoding loop if we don't have enough bytes + // Test for availableChars is done because pStop would be <= pTarget. + if (availableBytes <= 13) { + // we may need as many as 1 character per byte + if (availableChars < availableBytes) { + // not enough output room. 
no pending bits at this point + ch = 0; + continue; + } + + // try to get over the remainder of the ascii characters fast though + BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + while (pSrc < pLocalEnd) { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) + goto ProcessChar; + + *pTarget = (WCHAR)ch; + pTarget++; + } + // we are done + ch = 0; + break; + } + + // we may need as many as 1 character per byte, so reduce the byte count if necessary. + // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. + if (availableChars < availableBytes) { + availableBytes = availableChars; + } + + // To compute the upper bound, assume that all characters are ASCII characters at this point, + // the boundary will be decreased for every non-ASCII character we encounter + // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences + WCHAR *pStop = pTarget + availableBytes - 7; + + while (pTarget < pStop) { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) { + goto LongCode; + } + *pTarget = (WCHAR)ch; + pTarget++; + + // get pSrc to be 2-byte aligned + if ((((size_t)pSrc) & 0x1) != 0) { + ch = *pSrc; + pSrc++; + if (ch > 0x7F) { + goto LongCode; + } + *pTarget = (WCHAR)ch; + pTarget++; + } + + // get pSrc to be 4-byte aligned + if ((((size_t)pSrc) & 0x2) != 0) { + ch = *(USHORT*)pSrc; + if ((ch & 0x8080) != 0) { + goto LongCodeWithMask16; + } + + // Unfortunately, this is endianness sensitive +#if BIGENDIAN + *pTarget = (WCHAR)((ch >> 8) & 0x7F); + pSrc += 2; + *(pTarget + 1) = (WCHAR)(ch & 0x7F); + pTarget += 2; +#else // BIGENDIAN + *pTarget = (WCHAR)(ch & 0x7F); + pSrc += 2; + *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); + pTarget += 2; +#endif // BIGENDIAN + } + + // Run 8 characters at a time! 
+ while (pTarget < pStop) { + ch = *(int*)pSrc; + int chb = *(int*)(pSrc + 4); + if (((ch | chb) & (int)0x80808080) != 0) { + goto LongCodeWithMask32; + } + + // Unfortunately, this is endianness sensitive +#if BIGENDIAN + *pTarget = (WCHAR)((ch >> 24) & 0x7F); + *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F); + *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F); + *(pTarget + 3) = (WCHAR)(ch & 0x7F); + pSrc += 8; + *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F); + *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F); + *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F); + *(pTarget + 7) = (WCHAR)(chb & 0x7F); + pTarget += 8; +#else // BIGENDIAN + *pTarget = (WCHAR)(ch & 0x7F); + *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); + *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F); + *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F); + pSrc += 8; + *(pTarget + 4) = (WCHAR)(chb & 0x7F); + *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F); + *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F); + *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F); + pTarget += 8; +#endif // BIGENDIAN + } + break; + +#if BIGENDIAN + LongCodeWithMask32 : + // be careful about the sign extension + ch = (int)(((uint)ch) >> 16); + LongCodeWithMask16: + ch = (int)(((uint)ch) >> 8); +#else // BIGENDIAN + LongCodeWithMask32: + LongCodeWithMask16: + ch &= 0xFF; +#endif // BIGENDIAN + pSrc++; + if (ch <= 0x7F) { + *pTarget = (WCHAR)ch; + pTarget++; + continue; + } + + LongCode: + int chc = *pSrc; + pSrc++; + + if ( + // bit 6 has to be zero + (ch & 0x40) == 0 || + // we are expecting to see trailing bytes like 10vvvvvv + (chc & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc &= 0x3F; + + // start a new long code + if ((ch & 0x20) != 0) { + + // fold the first two bytes together + chc |= (ch & 0x0F) << 6; + + if ((ch & 0x10) != 0) { + // 4 byte encoding - surrogate + ch = *pSrc; + if ( + // check that bit 4 is zero, the non-shortest form of surrogate + // and the valid surrogate range 0x000000 - 0x10FFFF at the same time + !InRange(chc >> 4, 0x01, 0x10) || 
+ // we are expecting to see trailing bytes like 10vvvvvv + (ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc = (chc << 6) | (ch & 0x3F); + + ch = *(pSrc + 1); + // we are expecting to see trailing bytes like 10vvvvvv + if ((ch & 0xC0) != 0x80) { + goto BadLongCode; + } + pSrc += 2; + + ch = (chc << 6) | (ch & 0x3F); + + *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + + (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))); + pTarget++; + + ch = (ch & 0x3FF) + + (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START); + + // extra byte, we're already planning 2 chars for 2 of these bytes, + // but the big loop is testing the target against pStop, so we need + // to subtract 2 more or we risk overrunning the input. Subtract + // one here and one below. + pStop--; + } + else { + // 3 byte encoding + ch = *pSrc; + if ( + // check for non-shortest form of 3 byte seq + (chc & (0x1F << 5)) == 0 || + // Can't have surrogates here. + (chc & (0xF800 >> 6)) == (0xD800 >> 6) || + // we are expecting to see trailing bytes like 10vvvvvv + (ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + pSrc++; + + ch = (chc << 6) | (ch & 0x3F); + + // extra byte, we're only expecting 1 char for each of these 3 bytes, + // but the loop is testing the target (not source) against pStop, so + // we need to subtract 2 more or we risk overrunning the input. + // Subtract 1 here and one more below + pStop--; + } + } + else { + // 2 byte encoding + + ch &= 0x1F; + + // check for non-shortest form + if (ch <= 1) { + goto BadLongCode; + } + ch = (ch << 6) | chc; + } + + *pTarget = (WCHAR)ch; + pTarget++; + + // extra byte, we're only expecting 1 char for each of these 2 bytes, + // but the loop is testing the target (not source) against pStop. + // subtract an extra count from pStop so that we don't overrun the input. 
+ pStop--; + } +#endif // FASTLOOP + + Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd"); + + // no pending bits at this point + ch = 0; + continue; + + BadLongCode: + pSrc -= 2; + ch = 0; + continue; + } + + if (ch != 0) + { + // Have to do fallback for invalid bytes + if (fallback == nullptr) + { + fallback = decoderFallback->CreateFallbackBuffer(); + fallback->InternalInitialize(bytes, pAllocatedBufferEnd); + } + + // This'll back us up the appropriate # of bytes if we didn't get anywhere + if (!FallbackInvalidByteSequence(pSrc, ch, fallback)) + { + Contract::Assert(pSrc >= bytes || pTarget == chars, + "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing"); + + // Ran out of buffer space + // Need to throw an exception? + fallback->InternalReset(); + ThrowCharsOverflow(pTarget == chars); + } + Contract::Assert(pSrc >= bytes, + "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array"); + ch = 0; + } + + // Shouldn't have anything in fallback buffer for GetChars + // (don't have to check m_throwOnOverflow for chars) + Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, + "[UTF8Encoding.GetChars]Expected empty fallback buffer at end"); + + InternalDelete(fallback); + + return PtrDiff(pTarget, chars); + } + + int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount) + { + Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr"); + Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0"); + Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0"); + Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr"); + + // For fallback we may need a fallback buffer. 
+ // We wait to initialize it though in case we don't have any broken input unicode + EncoderFallbackBuffer* fallbackBuffer = nullptr; + WCHAR *pSrc = chars; + BYTE *pTarget = bytes; + + WCHAR *pEnd = pSrc + charCount; + BYTE *pAllocatedBufferEnd = pTarget + byteCount; + + int ch = 0; + + // assume that JIT will enregister pSrc, pTarget and ch + + while (true) { + // SLOWLOOP: does all range checks, handles all special cases, but it is slow + + if (pSrc >= pEnd) { + + if (ch == 0) { + // Check if there's anything left to get out of the fallback buffer + ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; + if (ch > 0) { + goto ProcessChar; + } + } + else { + // Case of leftover surrogates in the fallback buffer + if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { + Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, + "[UTF8Encoding.GetBytes]expected high surrogate"); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); + + int cha = ch; + + ch = fallbackBuffer->InternalGetNextChar(); + + if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { + ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); + goto EncodeChar; + } + else if (ch > 0){ + goto ProcessChar; + } + else { + break; + } + } + } + + // attempt to encode the partial surrogate (will fail or ignore) + if (ch > 0) + goto EncodeChar; + + // We're done + break; + } + + if (ch > 0) { + // We have a high surrogate left over from a previous loop. 
+ Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, + "[UTF8Encoding.GetBytes]expected high surrogate");//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); + + // use separate helper variables for local contexts so that the jit optimizations + // won't get confused about the variable lifetimes + int cha = *pSrc; + + // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. + // if (IsLowSurrogate(cha)) { + if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { + ch = cha + (ch << 10) + + (0x10000 + - CharUnicodeInfo::LOW_SURROGATE_START + - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); + + pSrc++; + } + // else ch is still high surrogate and encoding will fail + + // attempt to encode the surrogate or partial surrogate + goto EncodeChar; + } + + // If we've used a fallback, then we have to check for it + if (fallbackBuffer != nullptr) + { + ch = fallbackBuffer->InternalGetNextChar(); + if (ch > 0) goto ProcessChar; + } + + // read next char. The JIT optimization seems to be getting confused when + // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead + ch = *pSrc; + pSrc++; + + ProcessChar: + if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { + continue; + } + // either good char or partial surrogate + + EncodeChar: + // throw exception on partial surrogate if necessary + if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) + { + // Lone surrogates aren't allowed, we have to do fallback for them + // Have to make a fallback buffer if we don't have one + if (fallbackBuffer == nullptr) + { + // wait on fallbacks if we can + // For fallback we may need a fallback buffer + fallbackBuffer = encoderFallback->CreateFallbackBuffer(); + + // Set our internal fallback interesting things. + fallbackBuffer->InternalInitialize(chars, pEnd, true); + } + + // Do our fallback. 
Actually we already know its a mixed up surrogate, + // so the ref pSrc isn't gonna do anything. + fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); + + // Ignore it if we don't throw + ch = 0; + continue; + } + + // Count bytes needed + int bytesNeeded = 1; + if (ch > 0x7F) { + if (ch > 0x7FF) { + if (ch > 0xFFFF) { + bytesNeeded++; // 4 bytes (surrogate pair) + } + bytesNeeded++; // 3 bytes (800-FFFF) + } + bytesNeeded++; // 2 bytes (80-7FF) + } + + if (pTarget > pAllocatedBufferEnd - bytesNeeded) { + // Left over surrogate from last time will cause pSrc == chars, so we'll throw + if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) + { + fallbackBuffer->MovePrevious(); // Didn't use this fallback char + if (ch > 0xFFFF) + fallbackBuffer->MovePrevious(); // Was surrogate, didn't use 2nd part either + } + else + { + pSrc--; // Didn't use this char + if (ch > 0xFFFF) + pSrc--; // Was surrogate, didn't use 2nd part either + } + Contract::Assert(pSrc >= chars || pTarget == bytes, + "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room."); + ThrowBytesOverflow(pTarget == bytes); // Throw if we must + ch = 0; // Nothing left over (we backed up to start of pair if supplimentary) + break; + } + + if (ch <= 0x7F) { + *pTarget = (BYTE)ch; + } + else { + // use separate helper variables for local contexts so that the jit optimizations + // won't get confused about the variable lifetimes + int chb; + if (ch <= 0x7FF) { + // 2 BYTE encoding + chb = (BYTE)(0xC0 | (ch >> 6)); + } + else + { + if (ch <= 0xFFFF) { + chb = (BYTE)(0xE0 | (ch >> 12)); + } + else + { + *pTarget = (BYTE)(0xF0 | (ch >> 18)); + pTarget++; + + chb = 0x80 | ((ch >> 12) & 0x3F); + } + *pTarget = (BYTE)chb; + pTarget++; + + chb = 0x80 | ((ch >> 6) & 0x3F); + } + *pTarget = (BYTE)chb; + pTarget++; + + *pTarget = (BYTE)0x80 | (ch & 0x3F); + } + pTarget++; + + +#ifdef FASTLOOP + // If still have fallback don't do fast loop + if (fallbackBuffer != nullptr && 
(ch = fallbackBuffer->InternalGetNextChar()) != 0) + goto ProcessChar; + + int availableChars = PtrDiff(pEnd, pSrc); + int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget); + + // don't fall into the fast decoding loop if we don't have enough characters + // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. + if (availableChars <= 13) { + // we are hoping for 1 BYTE per char + if (availableBytes < availableChars) { + // not enough output room. no pending bits at this point + ch = 0; + continue; + } + + // try to get over the remainder of the ascii characters fast though + WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + while (pSrc < pLocalEnd) { + ch = *pSrc; + pSrc++; + + // Not ASCII, need more than 1 BYTE per char + if (ch > 0x7F) + goto ProcessChar; + + *pTarget = (BYTE)ch; + pTarget++; + } + // we are done, let ch be 0 to clear encoder + ch = 0; + break; + } + + // we need at least 1 BYTE per character, but Convert might allow us to convert + // only part of the input, so try as much as we can. Reduce charCount if necessary + if (availableBytes < availableChars) + { + availableChars = availableBytes; + } + + // FASTLOOP: + // - optimistic range checks + // - fallbacks to the slow loop for all special cases, exception throwing, etc. + + // To compute the upper bound, assume that all characters are ASCII characters at this point, + // the boundary will be decreased for every non-ASCII character we encounter + // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates + // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. 
+ WCHAR *pStop = pSrc + availableChars - 5; + + while (pSrc < pStop) { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) { + goto LongCode; + } + *pTarget = (BYTE)ch; + pTarget++; + + // get pSrc aligned + if (((size_t)pSrc & 0x2) != 0) { + ch = *pSrc; + pSrc++; + if (ch > 0x7F) { + goto LongCode; + } + *pTarget = (BYTE)ch; + pTarget++; + } + + // Run 4 characters at a time! + while (pSrc < pStop) { + ch = *(int*)pSrc; + int chc = *(int*)(pSrc + 2); + if (((ch | chc) & (int)0xFF80FF80) != 0) { + goto LongCodeWithMask; + } + + // Unfortunately, this is endianness sensitive +#if BIGENDIAN + *pTarget = (BYTE)(ch >> 16); + *(pTarget + 1) = (BYTE)ch; + pSrc += 4; + *(pTarget + 2) = (BYTE)(chc >> 16); + *(pTarget + 3) = (BYTE)chc; + pTarget += 4; +#else // BIGENDIAN + *pTarget = (BYTE)ch; + *(pTarget + 1) = (BYTE)(ch >> 16); + pSrc += 4; + *(pTarget + 2) = (BYTE)chc; + *(pTarget + 3) = (BYTE)(chc >> 16); + pTarget += 4; +#endif // BIGENDIAN + } + continue; + + LongCodeWithMask: +#if BIGENDIAN + // be careful about the sign extension + ch = (int)(((uint)ch) >> 16); +#else // BIGENDIAN + ch = (WCHAR)ch; +#endif // BIGENDIAN + pSrc++; + + if (ch > 0x7F) { + goto LongCode; + } + *pTarget = (BYTE)ch; + pTarget++; + continue; + + LongCode: + // use separate helper variables for slow and fast loop so that the jit optimizations + // won't get confused about the variable lifetimes + int chd; + if (ch <= 0x7FF) { + // 2 BYTE encoding + chd = 0xC0 | (ch >> 6); + } + else { + if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { + // 3 BYTE encoding + chd = 0xE0 | (ch >> 12); + } + else + { + // 4 BYTE encoding - high surrogate + low surrogate + if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) { + // low without high -> bad, try again in slow loop + pSrc -= 1; + break; + } + + chd = *pSrc; + pSrc++; + + // if (!IsLowSurrogate(chd)) { + if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { + // high not followed 
by low -> bad, try again in slow loop + pSrc -= 2; + break; + } + + ch = chd + (ch << 10) + + (0x10000 + - CharUnicodeInfo::LOW_SURROGATE_START + - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); + + *pTarget = (BYTE)(0xF0 | (ch >> 18)); + // pStop - this BYTE is compensated by the second surrogate character + // 2 input chars require 4 output bytes. 2 have been anticipated already + // and 2 more will be accounted for by the 2 pStop-- calls below. + pTarget++; + + chd = 0x80 | ((ch >> 12) & 0x3F); + } + *pTarget = (BYTE)chd; + pStop--; // 3 BYTE sequence for 1 char, so need pStop-- and the one below too. + pTarget++; + + chd = 0x80 | ((ch >> 6) & 0x3F); + } + *pTarget = (BYTE)chd; + pStop--; // 2 BYTE sequence for 1 char so need pStop--. + pTarget++; + + *pTarget = (BYTE)(0x80 | (ch & 0x3F)); + // pStop - this BYTE is already included + pTarget++; + } + + Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd"); + +#endif // FASTLOOP + + // no pending char at this point + ch = 0; + } + + InternalDelete(fallbackBuffer); + + return (int)(pTarget - bytes); + } + + int GetByteCount(WCHAR *chars, int count) + { + // For fallback we may need a fallback buffer. + // We wait to initialize it though in case we don't have any broken input unicode + EncoderFallbackBuffer* fallbackBuffer = nullptr; + WCHAR *pSrc = chars; + WCHAR *pEnd = pSrc + count; + + // Start by assuming we have as many as count + int byteCount = count; + + int ch = 0; + + while (true) { + // SLOWLOOP: does all range checks, handles all special cases, but it is slow + if (pSrc >= pEnd) { + + if (ch == 0) { + // Unroll any fallback that happens at the end + ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; + if (ch > 0) { + byteCount++; + goto ProcessChar; + } + } + else { + // Case of surrogates in the fallback. 
+ if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { + Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, + "[UTF8Encoding.GetBytes]expected high surrogate");// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); + + ch = fallbackBuffer->InternalGetNextChar(); + byteCount++; + + if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { + ch = 0xfffd; + byteCount++; + goto EncodeChar; + } + else if (ch > 0){ + goto ProcessChar; + } + else { + byteCount--; // ignore last one. + break; + } + } + } + + if (ch <= 0) { + break; + } + + // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1. + byteCount++; + goto EncodeChar; + } + + if (ch > 0) { + Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, + "[UTF8Encoding.GetBytes]expected high surrogate"); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); + + // use separate helper variables for local contexts so that the jit optimizations + // won't get confused about the variable lifetimes + int cha = *pSrc; + + // count the pending surrogate + byteCount++; + + // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. + // if (IsLowSurrogate(cha)) { + if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { + // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do. 
+ ch = 0xfffd; + // ch = cha + (ch << 10) + + // (0x10000 + // - CharUnicodeInfo::LOW_SURROGATE_START + // - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) ); + + // Use this next char + pSrc++; + } + // else ch is still high surrogate and encoding will fail (so don't add count) + + // attempt to encode the surrogate or partial surrogate + goto EncodeChar; + } + + // If we've used a fallback, then we have to check for it + if (fallbackBuffer != nullptr) + { + ch = fallbackBuffer->InternalGetNextChar(); + if (ch > 0) + { + // We have an extra byte we weren't expecting. + byteCount++; + goto ProcessChar; + } + } + + // read next char. The JIT optimization seems to be getting confused when + // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead + ch = *pSrc; + pSrc++; + + ProcessChar: + if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { + // we will count this surrogate next time around + byteCount--; + continue; + } + // either good char or partial surrogate + + EncodeChar: + // throw exception on partial surrogate if necessary + if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) + { + // Lone surrogates aren't allowed + // Have to make a fallback buffer if we don't have one + if (fallbackBuffer == nullptr) + { + // wait on fallbacks if we can + // For fallback we may need a fallback buffer + fallbackBuffer = encoderFallback->CreateFallbackBuffer(); + + // Set our internal fallback interesting things. + fallbackBuffer->InternalInitialize(chars, chars + count, false); + } + + // Do our fallback. Actually we already know its a mixed up surrogate, + // so the ref pSrc isn't gonna do anything. 
+ fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); + + // Ignore it if we don't throw (we had preallocated this ch) + byteCount--; + ch = 0; + continue; + } + + // Count them + if (ch > 0x7F) { + if (ch > 0x7FF) { + // the extra surrogate byte was compensated by the second surrogate character + // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char) + byteCount++; + } + byteCount++; + } + +#if WIN64 + // check for overflow + if (byteCount < 0) { + break; + } +#endif + +#ifdef FASTLOOP + // If still have fallback don't do fast loop + if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0) + { + // We're reserving 1 byte for each char by default + byteCount++; + goto ProcessChar; + } + + int availableChars = PtrDiff(pEnd, pSrc); + + // don't fall into the fast decoding loop if we don't have enough characters + if (availableChars <= 13) { + // try to get over the remainder of the ascii characters fast though + WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + while (pSrc < pLocalEnd) { + ch = *pSrc; + pSrc++; + if (ch > 0x7F) + goto ProcessChar; + } + + // we are done + break; + } + +#if WIN64 + // make sure that we won't get a silent overflow inside the fast loop + // (Fall out to slow loop if we have this many characters) + availableChars &= 0x0FFFFFFF; +#endif + + // To compute the upper bound, assume that all characters are ASCII characters at this point, + // the boundary will be decreased for every non-ASCII character we encounter + // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates + WCHAR *pStop = pSrc + availableChars - (3 + 4); + + while (pSrc < pStop) { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) // Not ASCII + { + if (ch > 0x7FF) // Not 2 Byte + { + if ((ch & 0xF800) == 0xD800) // See if its a Surrogate + goto LongCode; + byteCount++; + } + byteCount++; + } + + // get pSrc aligned + if (((size_t)pSrc & 0x2) != 0) { + ch = *pSrc; + pSrc++; + 
if (ch > 0x7F) // Not ASCII + { + if (ch > 0x7FF) // Not 2 Byte + { + if ((ch & 0xF800) == 0xD800) // See if its a Surrogate + goto LongCode; + byteCount++; + } + byteCount++; + } + } + + // Run 2 * 4 characters at a time! + while (pSrc < pStop) { + ch = *(int*)pSrc; + int chc = *(int*)(pSrc + 2); + if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII + { + if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte + { + goto LongCodeWithMask; + } + + + if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits) + byteCount++; + if ((ch & (int)0xFF80) != 0) + byteCount++; + if ((chc & (int)0xFF800000) != 0) + byteCount++; + if ((chc & (int)0xFF80) != 0) + byteCount++; + } + pSrc += 4; + + ch = *(int*)pSrc; + chc = *(int*)(pSrc + 2); + if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII + { + if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte + { + goto LongCodeWithMask; + } + + if ((ch & (int)0xFF800000) != 0) + byteCount++; + if ((ch & (int)0xFF80) != 0) + byteCount++; + if ((chc & (int)0xFF800000) != 0) + byteCount++; + if ((chc & (int)0xFF80) != 0) + byteCount++; + } + pSrc += 4; + } + break; + + LongCodeWithMask: +#if BIGENDIAN + // be careful about the sign extension + ch = (int)(((uint)ch) >> 16); +#else // BIGENDIAN + ch = (WCHAR)ch; +#endif // BIGENDIAN + pSrc++; + + if (ch <= 0x7F) { + continue; + } + + LongCode: + // use separate helper variables for slow and fast loop so that the jit optimizations + // won't get confused about the variable lifetimes + if (ch > 0x7FF) { + if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { + // 4 byte encoding - high surrogate + low surrogate + + int chd = *pSrc; + if ( + ch > CharUnicodeInfo::HIGH_SURROGATE_END || + !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) + { + // Back up and drop out to slow loop to figure out error + pSrc--; + break; + } + pSrc++; + + // byteCount - this 
byte is compensated by the second surrogate character + } + byteCount++; + } + byteCount++; + + // byteCount - the last byte is already included + } +#endif // FASTLOOP + + // no pending char at this point + ch = 0; + } + +#if WIN64 + // check for overflow + if (byteCount < 0) { + throw ArgumentException("Conversion buffer overflow."); + } +#endif + + Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0, + "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer"); + + InternalDelete(fallbackBuffer); + + return byteCount; + } + +}; + + +//////////////////////////////////////////////////////////////////////////// +// +// UTF8ToUnicode +// +// Maps a UTF-8 character string to its wide character string counterpart. +// +//////////////////////////////////////////////////////////////////////////// + +int UTF8ToUnicode( + LPCSTR lpSrcStr, + int cchSrc, + LPWSTR lpDestStr, + int cchDest, + DWORD dwFlags + ) +{ + int ret; + UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS); + try { + ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc); + if (cchDest){ + if (ret > cchDest){ + SetLastError(ERROR_INSUFFICIENT_BUFFER); + ret = 0; + } + enc.GetChars((BYTE*)lpSrcStr, cchSrc, (WCHAR*)lpDestStr, ret); + } + } + catch (const InsufficientBufferException& e){ + SetLastError(ERROR_INSUFFICIENT_BUFFER); + return 0; + } + catch (const DecoderFallbackException& e){ + SetLastError(ERROR_NO_UNICODE_TRANSLATION); + return 0; + } + catch (const ArgumentException& e){ + SetLastError(ERROR_INVALID_PARAMETER); + return 0; + } + return ret; +} + +//////////////////////////////////////////////////////////////////////////// +// +// UnicodeToUTF8 +// +// Maps a Unicode character string to its UTF-8 string counterpart. 
+// +//////////////////////////////////////////////////////////////////////////// + +int UnicodeToUTF8( + LPCWSTR lpSrcStr, + int cchSrc, + LPSTR lpDestStr, + int cchDest) +{ + int ret; + UTF8Encoding enc(false); + try{ + ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc); + if (cchDest){ + if (ret > cchDest){ + SetLastError(ERROR_INSUFFICIENT_BUFFER); + ret = 0; + } + enc.GetBytes((WCHAR*)lpSrcStr, cchSrc, (BYTE*)lpDestStr, ret); + } + } + catch (const InsufficientBufferException& e){ + SetLastError(ERROR_INSUFFICIENT_BUFFER); + return 0; + } + catch (const EncoderFallbackException& e){ + SetLastError(ERROR_NO_UNICODE_TRANSLATION); + return 0; + } + catch (const ArgumentException& e){ + SetLastError(ERROR_INVALID_PARAMETER); + return 0; + } + return ret; +} diff --git a/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp b/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp index 2b9f67b17bfbde..cab71f15e7098e 100644 --- a/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp +++ b/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp @@ -217,7 +217,7 @@ PALTEST(locale_info_MultiByteToWideChar_test4_paltest_multibytetowidechar_test4, if (wcscmp(wideBuffer, unicodeStrings[i]) != 0) { - printf("MultiByteToWideChar string %d: the resulting string doesn't match the expected one!\n", i); + Fail("MultiByteToWideChar string %d: the resulting string doesn't match the expected one!\n", i); } free(wideBuffer); diff --git a/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp b/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp index 387015f0af71cc..bf2dabedefa880 100644 --- a/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp +++ b/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp @@ -141,7 +141,7 @@ 
PALTEST(locale_info_WideCharToMultiByte_test5_paltest_widechartomultibyte_test5, if (strcmp(utf8Buffer, utf8Strings[i]) != 0) { - printf("WideCharToMultiByte string %d: the resulting string doesn't match the expected one!\n", i); + Fail("WideCharToMultiByte string %d: the resulting string doesn't match the expected one!\n", i); } free(utf8Buffer); diff --git a/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineEvent.cs b/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineEvent.cs index a01945d7f64911..05cda03a777379 100644 --- a/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineEvent.cs +++ b/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineEvent.cs @@ -26,7 +26,7 @@ public static IEnumerable TestData() } [Theory] - [ActiveIssue("https://github.com/dotnet/runtime/issues/2389", TestRuntimes.Mono | TestRuntimes.CoreCLR)] + [ActiveIssue("https://github.com/dotnet/runtime/issues/2389", TestRuntimes.Mono)] [MemberData(nameof(TestData))] public void DefineEvent(string name, EventAttributes attributes, Type eventType, string expectedName, EventAttributes expectedAttributes) { diff --git a/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineProperty.cs b/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineProperty.cs index 500ee8104766d7..84d27ee2f98e31 100644 --- a/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineProperty.cs +++ b/src/libraries/System.Reflection.Emit/tests/TypeBuilder/TypeBuilderDefineProperty.cs @@ -29,7 +29,7 @@ public static IEnumerable TestData() } [Theory] - [ActiveIssue("https://github.com/dotnet/runtime/issues/2389", TestRuntimes.Mono | TestRuntimes.CoreCLR)] + [ActiveIssue("https://github.com/dotnet/runtime/issues/2389", TestRuntimes.Mono)] [MemberData(nameof(TestData))] public void DefineProperty(string name, PropertyAttributes attributes, Type returnType, Type[] parameterTypes, string expectedName, 
PropertyAttributes expectedPropertyAttributes) { diff --git a/src/mono/mono/eglib/CMakeLists.txt b/src/mono/mono/eglib/CMakeLists.txt index b2945231711711..3de4a9c83d2f5b 100644 --- a/src/mono/mono/eglib/CMakeLists.txt +++ b/src/mono/mono/eglib/CMakeLists.txt @@ -20,7 +20,7 @@ set(eglib_common_sources gbytearray.c gerror.c ghashtable.c - ${CLR_SRC_NATIVE_DIR}/minipal/utf8converter.c + giconv.c gmem.c goutput.c gstr.c @@ -32,7 +32,8 @@ set(eglib_common_sources gpath.c gspawn.c gfile.c - gfile-posix.c) + gfile-posix.c + gutf8.c) set(eglib_headers glib.h diff --git a/src/native/minipal/utf8converter.c b/src/mono/mono/eglib/giconv.c similarity index 68% rename from src/native/minipal/utf8converter.c rename to src/mono/mono/eglib/giconv.c index 0aeada3f4773a0..664ad31bba258a 100644 --- a/src/native/minipal/utf8converter.c +++ b/src/mono/mono/eglib/giconv.c @@ -1,7 +1,32 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#include +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* + * Copyright (C) 2011 Jeffrey Stedfast + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include "../utils/mono-errno.h" #ifdef _MSC_VER #define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE @@ -9,333 +34,40 @@ #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline)) #endif + +#define UNROLL_DECODE_UTF8 0 +#define UNROLL_ENCODE_UTF8 0 + +static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar); +static int encode_utf32be (gunichar c, char *outbuf, size_t outleft); + +static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar); +static int encode_utf32le (gunichar c, char *outbuf, size_t outleft); + +static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar); +static int encode_utf16be (gunichar c, char *outbuf, size_t outleft); + +static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar); +static int encode_utf16le (gunichar c, char *outbuf, size_t outleft); + +static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar); +static int encode_utf8 (gunichar c, char *outbuf, size_t outleft); + +static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar); +static int encode_latin1 (gunichar c, char *outbuf, size_t outleft); + #if G_BYTE_ORDER == G_LITTLE_ENDIAN #define decode_utf32 decode_utf32le #define encode_utf32 encode_utf32le #define decode_utf16 decode_utf16le #define encode_utf16 encode_utf16le -#define GUINT16_TO_LE(x) (x) -#define GUINT16_TO_BE(x) GUINT16_SWAP_LE_BE(x) #else #define decode_utf32 decode_utf32be #define encode_utf32 encode_utf32be #define decode_utf16 decode_utf16be #define encode_utf16 encode_utf16be -#define GUINT16_TO_LE(x) GUINT16_SWAP_LE_BE(x) -#define GUINT16_TO_BE(x) (x) #endif -/* - * 
Index into the table below with the first byte of a UTF-8 sequence to get - * the number of bytes that are supposed to follow it to complete the sequence. - * - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left - * as-is for anyone who may want to do such conversion, which was allowed in - * earlier algorithms. -*/ -const guchar g_utf8_jump_table[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 -}; - -static gboolean -utf8_validate (const unsigned char *inptr, size_t len) -{ - const unsigned char *ptr = inptr + len; - unsigned char c; - - /* Everything falls through when TRUE... 
*/ - switch (len) { - default: - return FALSE; - case 4: - if ((c = (*--ptr)) < 0x80 || c > 0xBF) - return FALSE; - - if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) { - if (ptr[-2] == 0x8F || ptr[-2] == 0x9F || - ptr[-2] == 0xAF || ptr[-2] == 0xBF) - return FALSE; - } - case 3: - if ((c = (*--ptr)) < 0x80 || c > 0xBF) - return FALSE; - case 2: - if ((c = (*--ptr)) < 0x80 || c > 0xBF) - return FALSE; - - /* no fall-through in this inner switch */ - switch (*inptr) { - case 0xE0: if (c < 0xA0) return FALSE; break; - case 0xED: if (c > 0x9F) return FALSE; break; - case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE; - if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE; - break; - case 0xF0: if (c < 0x90) return FALSE; break; - case 0xF4: if (c > 0x8F) return FALSE; break; - default: if (c < 0x80) return FALSE; break; - } - case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE; - } - - if (*inptr > 0xF4) - return FALSE; - - return TRUE; -} - -/** - * g_utf8_validate: - * @str: a utf-8 encoded string - * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string) - * @end: output parameter to mark the end of the valid input - * - * Checks @utf for being valid UTF-8. @str is assumed to be - * null-terminated. This function is not super-strict, as it will - * allow longer UTF-8 sequences than necessary. Note that Java is - * capable of producing these sequences if provoked. Also note, this - * routine checks for the 4-byte maximum size, but does not check for - * 0x10ffff maximum value. - * - * Return value: %TRUE if @str is valid or %FALSE otherwise. 
- **/ -gboolean -g_utf8_validate (const gchar *str, gssize max_len, const gchar **end) -{ - guchar *inptr = (guchar *) str; - gboolean valid = TRUE; - guint length, min; - gssize n = 0; - - if (max_len == 0) - return FALSE; - - if (max_len < 0) { - while (*inptr != 0) { - length = g_utf8_jump_table[*inptr]; - if (!utf8_validate (inptr, length)) { - valid = FALSE; - break; - } - - inptr += length; - } - } else { - while (n < max_len) { - if (*inptr == 0) { - /* Note: return FALSE if we encounter nul-byte - * before max_len is reached. */ - valid = FALSE; - break; - } - - length = g_utf8_jump_table[*inptr]; - min = MIN (length, GSSIZE_TO_UINT (max_len - n)); - - if (!utf8_validate (inptr, min)) { - valid = FALSE; - break; - } - - if (min < length) { - valid = FALSE; - break; - } - - inptr += length; - n += length; - } - } - - if (end != NULL) - *end = (gchar *) inptr; - - return valid; -} - -gunichar -g_utf8_get_char_validated (const gchar *str, gssize max_len) -{ - unsigned char *inptr = (unsigned char *) str; - gunichar u = *inptr; - int n, i; - - if (max_len == 0) - return -2; - - if (u < 0x80) { - /* simple ascii case */ - return u; - } else if (u < 0xc2) { - return -1; - } else if (u < 0xe0) { - u &= 0x1f; - n = 2; - } else if (u < 0xf0) { - u &= 0x0f; - n = 3; - } else if (u < 0xf8) { - u &= 0x07; - n = 4; - } else if (u < 0xfc) { - u &= 0x03; - n = 5; - } else if (u < 0xfe) { - u &= 0x01; - n = 6; - } else { - return -1; - } - - if (max_len > 0) { - if (!utf8_validate (inptr, MIN (max_len, n))) - return -1; - - if (max_len < n) - return -2; - } else { - if (!utf8_validate (inptr, n)) - return -1; - } - - for (i = 1; i < n; i++) - u = (u << 6) | (*++inptr ^ 0x80); - - return u; -} - -glong -g_utf8_strlen (const gchar *str, gssize max_len) -{ - const guchar *inptr = (const guchar *) str; - glong clen = 0, len = 0, n; - - if (max_len == 0) - return 0; - - if (max_len < 0) { - while (*inptr) { - inptr += g_utf8_jump_table[*inptr]; - len++; - } - } else { - while 
(len < max_len && *inptr) { - n = g_utf8_jump_table[*inptr]; - if ((clen + n) > max_len) - break; - - inptr += n; - clen += n; - len++; - } - } - - return len; -} - -gunichar -g_utf8_get_char (const gchar *src) -{ - unsigned char *inptr = (unsigned char *) src; - gunichar u = *inptr; - int n, i; - - if (u < 0x80) { - /* simple ascii case */ - return u; - } else if (u < 0xe0) { - u &= 0x1f; - n = 2; - } else if (u < 0xf0) { - u &= 0x0f; - n = 3; - } else if (u < 0xf8) { - u &= 0x07; - n = 4; - } else if (u < 0xfc) { - u &= 0x03; - n = 5; - } else { - u &= 0x01; - n = 6; - } - - for (i = 1; i < n; i++) - u = (u << 6) | (*++inptr ^ 0x80); - - return u; -} - -gchar * -g_utf8_offset_to_pointer (const gchar *str, glong offset) -{ - const gchar *p = str; - - if (offset > 0) { - do { - p = g_utf8_next_char (p); - offset --; - } while (offset > 0); - } - else if (offset < 0) { - const gchar *jump = str; - do { - // since the minimum size of a character is 1 - // we know we can step back at least offset bytes - jump = jump + offset; - - // if we land in the middle of a character - // walk to the beginning - while ((*jump & 0xc0) == 0x80) - jump --; - - // count how many characters we've actually walked - // by going forward - p = jump; - do { - p = g_utf8_next_char (p); - offset ++; - } while (p < jump); - - } while (offset < 0); - } - - return (gchar *)p; -} - -glong -g_utf8_pointer_to_offset (const gchar *str, const gchar *pos) -{ - const gchar *inptr, *inend; - glong offset = 0; - glong sign = 1; - - if (pos == str) - return 0; - - if (str < pos) { - inptr = str; - inend = pos; - } else { - inptr = pos; - inend = str; - sign = -1; - } - - do { - inptr = g_utf8_next_char (inptr); - offset++; - } while (inptr < inend); - - return offset * sign; -} - /* * Unicode encoders and decoders */ @@ -687,12 +419,12 @@ encode_latin1 (gunichar c, char *outbuf, size_t outleft) * Simple conversion API */ -static gpointer g_error_quark = (gpointer)"ConvertError"; +static gpointer 
error_quark = (gpointer)"ConvertError"; gpointer g_convert_error_quark (void) { - return g_error_quark; + return error_quark; } /* * Unicode conversion @@ -814,7 +546,7 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) if (items_written) *items_written = n; - outptr = outbuf = (gunichar *)g_malloc ((n + 1) * sizeof (gunichar)); + outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar)); inptr = (char *) str; for (i = 0; i < n; i++) { @@ -828,7 +560,7 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) } static gunichar2 * -eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) +eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) { gunichar2 *outbuf, *outptr; size_t outlen = 0; @@ -879,7 +611,7 @@ eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written = (glong)outlen; if (G_LIKELY (!custom_alloc_func)) - outptr = outbuf = (gunichar2 *)g_malloc ((outlen + 1) * sizeof (gunichar2)); + outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2)); else outptr = outbuf = (gunichar2 *)custom_alloc_func ((outlen + 1) * sizeof (gunichar2), custom_alloc_data); @@ -910,8 +642,7 @@ eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong inptr += n; } - if (null_terminate) - *outptr = '\0'; + *outptr = '\0'; return outbuf; @@ -941,55 +672,49 @@ eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong gunichar2 * g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return 
eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_BYTE_ORDER); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BYTE_ORDER); } gunichar2 * g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_BIG_ENDIAN); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BIG_ENDIAN); } gunichar2 * g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_LITTLE_ENDIAN); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_LITTLE_ENDIAN); } gunichar2 * g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); -} - -gunichar2 * -g_utf8_to_utf16_custom_alloc_optional (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) -{ - return eg_utf8_to_utf16_general (str, len, items_read, items_written, include_nuls, replace_invalid_codepoints, null_terminate, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); } gunichar2 * g_utf8_to_utf16be_custom_alloc (const gchar *str, 
glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, custom_alloc_func, custom_alloc_data, err, G_BIG_ENDIAN); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BIG_ENDIAN); } gunichar2 * g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, custom_alloc_func, custom_alloc_data, err, G_LITTLE_ENDIAN); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_LITTLE_ENDIAN); } gunichar2 * eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, TRUE, NULL, NULL, err, G_BYTE_ORDER); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, NULL, NULL, err, G_BYTE_ORDER); } gunichar2 * eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, TRUE, NULL, NULL, err, G_BYTE_ORDER); + return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, NULL, NULL, err, G_BYTE_ORDER); } gunichar * @@ -1044,7 +769,7 @@ g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_wri if (items_read) *items_read = GPTRDIFF_TO_LONG (inptr - str); - outptr = outbuf = (gunichar *)g_malloc (outlen + 4); + outptr = outbuf = g_malloc (outlen + 4); inptr = (char *) str; inleft = len; @@ -1066,23 +791,17 @@ g_utf8_to_ucs4 
(const gchar *str, glong len, glong *items_read, glong *items_wri static gchar * -eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) +eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) { char *inptr, *outbuf, *outptr; size_t outlen = 0; size_t inleft; gunichar c; - gboolean replaced = FALSE; int n; g_return_val_if_fail (str != NULL, NULL); if (len < 0) { - if (include_nuls) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length"); - return NULL; - } - len = 0; while (str[len]) len++; @@ -1099,37 +818,30 @@ eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, gl inptr += 2; } - if (errno == EILSEQ && !replace_invalid_codepoints) { + if (errno == EILSEQ) { g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Illegal byte sequence encountered in the input."); - } else if (items_read && !replace_invalid_codepoints) { + } else if (items_read) { /* partial input is ok if we can let our caller know... 
*/ break; - } else if (!replace_invalid_codepoints) { + } else { g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, "Partial byte sequence encountered in the input."); } - if (replace_invalid_codepoints) { - n = sizeof(gunichar); - c = '?'; - replaced = TRUE; - } else { - if (items_read) - *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2); + if (items_read) + *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2); - if (items_written) - *items_written = 0; + if (items_written) + *items_written = 0; - return NULL; - } - } else if (c == 0 && !include_nuls) + return NULL; + } else if (c == 0) break; - outlen += (replaced && replace_invalid_codepoints) ? n - 1 : g_unichar_to_utf8 (c, NULL); + outlen += g_unichar_to_utf8 (c, NULL); inleft -= n; inptr += n; - replaced = FALSE; } if (items_read) @@ -1139,7 +851,7 @@ eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, gl *items_written = (glong)outlen; if (G_LIKELY (!custom_alloc_func)) - outptr = outbuf = (char *)g_malloc (outlen + 1); + outptr = outbuf = g_malloc (outlen + 1); else outptr = outbuf = (char *)custom_alloc_func (outlen + 1, custom_alloc_data); @@ -1154,24 +866,17 @@ eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, gl inleft = len * 2; while (inleft > 0) { - if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) { - if (replace_invalid_codepoints) { - outptr += '?'; - n = sizeof(gunichar); - } else - break; - } else if (c == 0 && !include_nuls) { + if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) + break; + else if (c == 0) break; - } else { - outptr += g_unichar_to_utf8 (c, outptr); - } + outptr += g_unichar_to_utf8 (c, outptr); inleft -= n; inptr += n; } - if (null_terminate) - *outptr = '\0'; + *outptr = '\0'; return outbuf; } @@ -1179,31 +884,25 @@ eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, gl gchar * g_utf16_to_utf8 (const gunichar2 *str, glong len, glong 
*items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_BYTE_ORDER); + return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BYTE_ORDER); } gchar * g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_LITTLE_ENDIAN); + return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_LITTLE_ENDIAN); } gchar * g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, NULL, NULL, err, G_BIG_ENDIAN); + return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BIG_ENDIAN); } gchar * g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, FALSE, FALSE, TRUE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); -} - -gchar * -g_utf16_to_utf8_custom_alloc_with_nulls (const gunichar2 *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) -{ - return eg_utf16_to_utf8_general (str, len, items_read, items_written, include_nuls, TRUE, null_terminate, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); + return eg_utf16_to_utf8_general (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); } gunichar * @@ -1267,7 +966,7 @@ g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *item if 
(items_written) *items_written = (glong)(outlen / 4); - outptr = outbuf = (gunichar *)g_malloc (outlen + 4); + outptr = outbuf = g_malloc (outlen + 4); inptr = (char *) str; inleft = len * 2; @@ -1335,7 +1034,7 @@ g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_ len = i; - outptr = outbuf = (char *)g_malloc (outlen + 1); + outptr = outbuf = g_malloc (outlen + 1); for (i = 0; i < len; i++) outptr += g_unichar_to_utf8 (str[i], outptr); *outptr = 0; @@ -1397,7 +1096,7 @@ g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items len = i; - outptr = outbuf = (gunichar2 *)g_malloc ((outlen + 1) * sizeof (gunichar2)); + outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2)); for (i = 0; i < len; i++) outptr += g_unichar_to_utf16 (str[i], outptr); *outptr = 0; diff --git a/src/mono/mono/eglib/glib.h b/src/mono/mono/eglib/glib.h index ef2101315eee98..e438c00298ec72 100644 --- a/src/mono/mono/eglib/glib.h +++ b/src/mono/mono/eglib/glib.h @@ -29,7 +29,6 @@ #include #include #include -#include "../utils/mono-errno.h" #ifndef EGLIB_NO_REMAP #include diff --git a/src/mono/mono/eglib/gutf8.c b/src/mono/mono/eglib/gutf8.c new file mode 100644 index 00000000000000..965a69f42e655d --- /dev/null +++ b/src/mono/mono/eglib/gutf8.c @@ -0,0 +1,323 @@ +/* + * gutf8.c: UTF-8 conversion + * + * Author: + * Atsushi Enomoto + * + * (C) 2006 Novell, Inc. + * Copyright 2012 Xamarin Inc + */ +#include "config.h" +#include +#include + +/* + * Index into the table below with the first byte of a UTF-8 sequence to get + * the number of bytes that are supposed to follow it to complete the sequence. + * + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left + * as-is for anyone who may want to do such conversion, which was allowed in + * earlier algorithms. 
+*/ +const guchar g_utf8_jump_table[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 +}; + +static gboolean +utf8_validate (const unsigned char *inptr, size_t len) +{ + const unsigned char *ptr = inptr + len; + unsigned char c; + + /* Everything falls through when TRUE... */ + switch (len) { + default: + return FALSE; + case 4: + if ((c = (*--ptr)) < 0x80 || c > 0xBF) + return FALSE; + + if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) { + if (ptr[-2] == 0x8F || ptr[-2] == 0x9F || + ptr[-2] == 0xAF || ptr[-2] == 0xBF) + return FALSE; + } + case 3: + if ((c = (*--ptr)) < 0x80 || c > 0xBF) + return FALSE; + case 2: + if ((c = (*--ptr)) < 0x80 || c > 0xBF) + return FALSE; + + /* no fall-through in this inner switch */ + switch (*inptr) { + case 0xE0: if (c < 0xA0) return FALSE; break; + case 0xED: if (c > 0x9F) return FALSE; break; + case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE; + if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE; + break; + case 0xF0: if (c < 0x90) return FALSE; break; + case 0xF4: if (c > 0x8F) return FALSE; break; + default: if (c < 0x80) return FALSE; break; + } + case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE; + } + + if (*inptr > 0xF4) + return FALSE; + + return TRUE; +} + +/** + * g_utf8_validate: + * @str: a utf-8 encoded string + * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string) + * @end: output parameter to mark the end of the valid input + * + 
* Checks @utf for being valid UTF-8. @str is assumed to be + * null-terminated. This function is not super-strict, as it will + * allow longer UTF-8 sequences than necessary. Note that Java is + * capable of producing these sequences if provoked. Also note, this + * routine checks for the 4-byte maximum size, but does not check for + * 0x10ffff maximum value. + * + * Return value: %TRUE if @str is valid or %FALSE otherwise. + **/ +gboolean +g_utf8_validate (const gchar *str, gssize max_len, const gchar **end) +{ + guchar *inptr = (guchar *) str; + gboolean valid = TRUE; + guint length, min; + gssize n = 0; + + if (max_len == 0) + return FALSE; + + if (max_len < 0) { + while (*inptr != 0) { + length = g_utf8_jump_table[*inptr]; + if (!utf8_validate (inptr, length)) { + valid = FALSE; + break; + } + + inptr += length; + } + } else { + while (n < max_len) { + if (*inptr == 0) { + /* Note: return FALSE if we encounter nul-byte + * before max_len is reached. */ + valid = FALSE; + break; + } + + length = g_utf8_jump_table[*inptr]; + min = MIN (length, GSSIZE_TO_UINT (max_len - n)); + + if (!utf8_validate (inptr, min)) { + valid = FALSE; + break; + } + + if (min < length) { + valid = FALSE; + break; + } + + inptr += length; + n += length; + } + } + + if (end != NULL) + *end = (gchar *) inptr; + + return valid; +} + +gunichar +g_utf8_get_char_validated (const gchar *str, gssize max_len) +{ + unsigned char *inptr = (unsigned char *) str; + gunichar u = *inptr; + int n, i; + + if (max_len == 0) + return -2; + + if (u < 0x80) { + /* simple ascii case */ + return u; + } else if (u < 0xc2) { + return -1; + } else if (u < 0xe0) { + u &= 0x1f; + n = 2; + } else if (u < 0xf0) { + u &= 0x0f; + n = 3; + } else if (u < 0xf8) { + u &= 0x07; + n = 4; + } else if (u < 0xfc) { + u &= 0x03; + n = 5; + } else if (u < 0xfe) { + u &= 0x01; + n = 6; + } else { + return -1; + } + + if (max_len > 0) { + if (!utf8_validate (inptr, MIN (max_len, n))) + return -1; + + if (max_len < n) + return -2; 
+ } else { + if (!utf8_validate (inptr, n)) + return -1; + } + + for (i = 1; i < n; i++) + u = (u << 6) | (*++inptr ^ 0x80); + + return u; +} + +glong +g_utf8_strlen (const gchar *str, gssize max_len) +{ + const guchar *inptr = (const guchar *) str; + glong clen = 0, len = 0, n; + + if (max_len == 0) + return 0; + + if (max_len < 0) { + while (*inptr) { + inptr += g_utf8_jump_table[*inptr]; + len++; + } + } else { + while (len < max_len && *inptr) { + n = g_utf8_jump_table[*inptr]; + if ((clen + n) > max_len) + break; + + inptr += n; + clen += n; + len++; + } + } + + return len; +} + +gunichar +g_utf8_get_char (const gchar *src) +{ + unsigned char *inptr = (unsigned char *) src; + gunichar u = *inptr; + int n, i; + + if (u < 0x80) { + /* simple ascii case */ + return u; + } else if (u < 0xe0) { + u &= 0x1f; + n = 2; + } else if (u < 0xf0) { + u &= 0x0f; + n = 3; + } else if (u < 0xf8) { + u &= 0x07; + n = 4; + } else if (u < 0xfc) { + u &= 0x03; + n = 5; + } else { + u &= 0x01; + n = 6; + } + + for (i = 1; i < n; i++) + u = (u << 6) | (*++inptr ^ 0x80); + + return u; +} + +gchar * +g_utf8_offset_to_pointer (const gchar *str, glong offset) +{ + const gchar *p = str; + + if (offset > 0) { + do { + p = g_utf8_next_char (p); + offset --; + } while (offset > 0); + } + else if (offset < 0) { + const gchar *jump = str; + do { + // since the minimum size of a character is 1 + // we know we can step back at least offset bytes + jump = jump + offset; + + // if we land in the middle of a character + // walk to the beginning + while ((*jump & 0xc0) == 0x80) + jump --; + + // count how many characters we've actually walked + // by going forward + p = jump; + do { + p = g_utf8_next_char (p); + offset ++; + } while (p < jump); + + } while (offset < 0); + } + + return (gchar *)p; +} + +glong +g_utf8_pointer_to_offset (const gchar *str, const gchar *pos) +{ + const gchar *inptr, *inend; + glong offset = 0; + glong sign = 1; + + if (pos == str) + return 0; + + if (str < pos) { + 
inptr = str; + inend = pos; + } else { + inptr = pos; + inend = str; + sign = -1; + } + + do { + inptr = g_utf8_next_char (inptr); + offset++; + } while (inptr < inend); + + return offset * sign; +} diff --git a/src/native/minipal/utf8converter.h b/src/native/minipal/utf8converter.h deleted file mode 100644 index 06cd677dfe1955..00000000000000 --- a/src/native/minipal/utf8converter.h +++ /dev/null @@ -1,200 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#ifndef HAVE_MINIPAL_UTF8CONVERTER_H -#define HAVE_MINIPAL_UTF8CONVERTER_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef CORECLR -#include "glib.h" -#endif - -#ifdef _MSC_VER -#define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE -#else -#define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline)) -#endif - -#if G_BYTE_ORDER == G_LITTLE_ENDIAN -#define decode_utf32 decode_utf32le -#define encode_utf32 encode_utf32le -#define decode_utf16 decode_utf16le -#define encode_utf16 encode_utf16le -#define GUINT16_TO_LE(x) (x) -#define GUINT16_TO_BE(x) GUINT16_SWAP_LE_BE(x) -#else -#define decode_utf32 decode_utf32be -#define encode_utf32 encode_utf32be -#define decode_utf16 decode_utf16be -#define encode_utf16 encode_utf16be -#define GUINT16_TO_LE(x) GUINT16_SWAP_LE_BE(x) -#define GUINT16_TO_BE(x) (x) -#endif - -#ifdef CORECLR - -#ifdef TARGET_64BIT -#define ptrdiff_t int64_t -#else -#define ptrdiff_t int32_t -#endif - -#define gunichar uint32_t -#define gunichar2 uint16_t -#define guint uint32_t -#define gchar char -#define guchar unsigned char -#define gboolean bool -#define gsize size_t -#define gssize ptrdiff_t -#define gint int32_t -#define glong long -#define gptrdiff ptrdiff_t -#define guint8 uint8_t -#define guint16 uint16_t -#define gpointer void* -#define g_malloc malloc -#define TRUE 1 -#define FALSE 0 -#ifndef MIN -#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) -#endif - -typedef void* (*GCustomAllocator) (size_t req_size, void* custom_alloc_data); - -typedef struct { - /* In the real glib, this is a GQuark, but we dont use/need that */ - void* domain; - int32_t code; - char *message; -} GError; - -typedef struct { - void* buffer; - size_t buffer_size; - size_t req_buffer_size; -} GFixedBufferCustomAllocatorData; - -typedef enum { - G_CONVERT_ERROR_NO_CONVERSION, - G_CONVERT_ERROR_ILLEGAL_SEQUENCE, - G_CONVERT_ERROR_FAILED, - G_CONVERT_ERROR_PARTIAL_INPUT, - G_CONVERT_ERROR_BAD_URI, - G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, - G_CONVERT_ERROR_NO_MEMORY -} GConvertError; - -#define UNROLL_DECODE_UTF8 0 -#define UNROLL_ENCODE_UTF8 0 - -static int decode_utf32be (char *inbuf, size_t inleft, uint32_t *outchar); -static int encode_utf32be (uint32_t c, char *outbuf, size_t outleft); - -static int decode_utf32le (char *inbuf, size_t inleft, uint32_t *outchar); -static int encode_utf32le (uint32_t c, char *outbuf, size_t outleft); - -static int decode_utf16be (char *inbuf, size_t inleft, uint32_t *outchar); -static int encode_utf16be (uint32_t c, char *outbuf, size_t outleft); - -static int decode_utf16le (char *inbuf, size_t inleft, uint32_t *outchar); -static int encode_utf16le (uint32_t c, char *outbuf, size_t outleft); - -static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, uint32_t *outchar); -static int encode_utf8 (uint32_t c, char *outbuf, size_t outleft); - -static int decode_latin1 (char *inbuf, size_t inleft, uint32_t *outchar); -static int encode_latin1 (uint32_t c, char *outbuf, size_t outleft); - -#define G_LITTLE_ENDIAN 1234 -#define G_BIG_ENDIAN 4321 -#define GUINT16_SWAP_LE_BE(x) ((uint16_t) (((uint16_t) x) >> 8) | ((((uint16_t)(x)) & 0xff) << 8)) - -#ifdef BIGENDIAN -#define G_BYTE_ORDER G_BIG_ENDIAN -#else -#define G_BYTE_ORDER G_LITTLE_ENDIAN -#endif - -#define G_CAST_TYPE_TO_TYPE(src,dest,v) ((dest)(v)) -#define G_CAST_PTRTYPE_TO_STYPE(src,dest,v) ((dest)(gssize)(v)) -#define 
GUINT32_TO_UINT16(v) G_CAST_TYPE_TO_TYPE(guint32, guint16, v) -#define GSIZE_TO_INT(v) G_CAST_TYPE_TO_TYPE(gsize, gint, v) -#define GSSIZE_TO_UINT(v) G_CAST_TYPE_TO_TYPE(gssize, guint, v) -#define GUNICHAR_TO_UINT8(v) G_CAST_TYPE_TO_TYPE(gunichar, guint8, v) -#define GUNICHAR_TO_UINT16(v) G_CAST_TYPE_TO_TYPE(gunichar, guint16, v) -#define GUNICHAR_TO_CHAR(v) G_CAST_TYPE_TO_TYPE(gunichar, gchar, v) -#define GPTRDIFF_TO_LONG(v) G_CAST_PTRTYPE_TO_STYPE(gptrdiff, glong, v) -#define g_return_val_if_fail(x,e) do { if (!(x)) { printf ("%s:%d: assertion '%s' failed\n", __FILE__, __LINE__, #x); return (e); } } while(0) -#define g_utf8_next_char(p) ((p) + g_utf8_jump_table[(unsigned char)(*p)]) - -#if defined(__GNUC__) && (__GNUC__ > 2) -#define G_LIKELY(expr) (__builtin_expect ((expr) != 0, 1)) -#define G_UNLIKELY(expr) (__builtin_expect ((expr) != 0, 0)) -#else -#define G_LIKELY(x) (x) -#define G_UNLIKELY(x) (x) -#endif - -void -g_set_error (GError **err, void* domain, int32_t code, const char *format, ...) 
-{ - va_list args; - - if (err) { - *err = (GError *) malloc (sizeof (GError)); - (*err)->domain = domain; - (*err)->code = code; - - va_start (args, format); - int s = vsnprintf(NULL, 0, format, args); - va_end(args); - - if (s > -1) - { - (*err)->message = (char*)malloc(s); - - va_start(args, format); - vsnprintf((*err)->message, s, format, args); - va_end (args); - } - } -} - -#define G_CONVERT_ERROR g_convert_error_quark() - -inline static void -mono_set_errno (int errno_val) -{ - errno = errno_val; -} - -#endif // CORECLR - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Unicode encoders and decoders - */ - -gunichar2 * -g_utf8_to_utf16_custom_alloc_optional (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); - -gchar * -g_utf16_to_utf8_custom_alloc_with_nulls (const gunichar2 *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean null_terminate, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); - -#ifdef __cplusplus -} -#endif // extern "C" - -#endif //HAVE_MINIPAL_UTF8CONVERTER_H From eb53c7fd05fafe5d0992c1b730e8aca31a1b05fa Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Thu, 25 May 2023 00:16:41 +0300 Subject: [PATCH 3/9] Share UTF8 converters between coreclr and mono - v2 --- src/coreclr/pal/src/CMakeLists.txt | 2 +- src/coreclr/pal/src/include/pal/utf8.h | 52 - src/coreclr/pal/src/locale/unicode.cpp | 19 +- src/mono/mono/eglib/CMakeLists.txt | 9 +- src/mono/mono/eglib/giconv.c | 524 +------ src/mono/mono/eglib/glib.h | 4 - src/mono/mono/eglib/test/utf8.c | 6 +- src/mono/mono/metadata/object.c | 2 +- .../src/locale => native/minipal}/utf8.cpp | 1206 ++++++++--------- src/native/minipal/utf8.h | 33 + 10 files changed, 704 insertions(+), 1153 deletions(-) delete mode 100644 
src/coreclr/pal/src/include/pal/utf8.h rename src/{coreclr/pal/src/locale => native/minipal}/utf8.cpp (72%) create mode 100644 src/native/minipal/utf8.h diff --git a/src/coreclr/pal/src/CMakeLists.txt b/src/coreclr/pal/src/CMakeLists.txt index bd5a6bdf4d5b22..d61ffc4cca5f11 100644 --- a/src/coreclr/pal/src/CMakeLists.txt +++ b/src/coreclr/pal/src/CMakeLists.txt @@ -152,7 +152,7 @@ set(SOURCES loader/module.cpp locale/unicode.cpp locale/unicodedata.cpp - locale/utf8.cpp + ${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp map/common.cpp map/map.cpp map/virtual.cpp diff --git a/src/coreclr/pal/src/include/pal/utf8.h b/src/coreclr/pal/src/include/pal/utf8.h deleted file mode 100644 index fa417c0a021f79..00000000000000 --- a/src/coreclr/pal/src/include/pal/utf8.h +++ /dev/null @@ -1,52 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*++ - - - -Module Name: - - include/pal/utf8.h - -Abstract: - Header file for UTF-8 conversion functions. 
- -Revision History: - - - ---*/ - -#ifndef _PAL_UTF8_H_ -#define _PAL_UTF8_H_ - -#include /* for WCHAR */ - -#ifdef __cplusplus -extern "C" -{ -#endif // __cplusplus - -/*++ -Function : - UTF8ToUnicode - - Convert a string from UTF-8 to UTF-16 (UCS-2) ---*/ -int UTF8ToUnicode(LPCSTR lpSrcStr, int cchSrc, LPWSTR lpDestStr, int cchDest, DWORD dwFlags); - - -/*++ -Function : - UnicodeToUTF8 - - Convert a string from UTF-16 (UCS-2) to UTF-8 ---*/ -int UnicodeToUTF8(LPCWSTR lpSrcStr, int cchSrc, LPSTR lpDestStr, int cchDest); - -#ifdef __cplusplus -} -#endif // __cplusplus - -#endif /* _PAL_UTF8_H_ */ diff --git a/src/coreclr/pal/src/locale/unicode.cpp b/src/coreclr/pal/src/locale/unicode.cpp index f29eabc07d9be3..d3f4da7a60b53f 100644 --- a/src/coreclr/pal/src/locale/unicode.cpp +++ b/src/coreclr/pal/src/locale/unicode.cpp @@ -24,7 +24,7 @@ Revision History: #include "pal/palinternal.h" #include "pal/dbgmsg.h" #include "pal/file.h" -#include "pal/utf8.h" +#include #include "pal/cruntime.h" #include "pal/stackstring.hpp" #include "pal/unicodedata.h" @@ -253,16 +253,11 @@ MultiByteToWideChar( goto EXIT; } - // Use UTF8ToUnicode on all systems, since it replaces + // Use minipal_utf8_to_utf16_preallocated on all systems, since it replaces // invalid characters and Core Foundation doesn't do that. if (CodePage == CP_UTF8 || CodePage == CP_ACP) { - if (cbMultiByte <= -1) - { - cbMultiByte = strlen(lpMultiByteStr) + 1; - } - - retval = UTF8ToUnicode(lpMultiByteStr, cbMultiByte, lpWideCharStr, cchWideChar, dwFlags); + retval = minipal_utf8_to_utf16_preallocated(lpMultiByteStr, cbMultiByte, &lpWideCharStr, cchWideChar, dwFlags, /* treatAsLE */ false); goto EXIT; } @@ -338,15 +333,11 @@ WideCharToMultiByte( defaultChar = *lpDefaultChar; } - // Use UnicodeToUTF8 on all systems because we use + // Use minipal_utf16_to_utf8_preallocated on all systems because we use // UTF8ToUnicode in MultiByteToWideChar() on all systems. 
if (CodePage == CP_UTF8 || CodePage == CP_ACP) { - if (cchWideChar == -1) - { - cchWideChar = PAL_wcslen(lpWideCharStr) + 1; - } - retval = UnicodeToUTF8(lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte); + retval = minipal_utf16_to_utf8_preallocated(lpWideCharStr, cchWideChar, &lpMultiByteStr, cbMultiByte); goto EXIT; } diff --git a/src/mono/mono/eglib/CMakeLists.txt b/src/mono/mono/eglib/CMakeLists.txt index 3de4a9c83d2f5b..a65a4e64085e92 100644 --- a/src/mono/mono/eglib/CMakeLists.txt +++ b/src/mono/mono/eglib/CMakeLists.txt @@ -33,7 +33,14 @@ set(eglib_common_sources gspawn.c gfile.c gfile-posix.c - gutf8.c) + gutf8.c + ${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp) + +if(HOST_WIN32) +set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4267 /wd4458 /wd4310") +else() +set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp" PROPERTIES COMPILE_FLAGS "-std=c++11 -nostdlib -fno-rtti -fno-exceptions") +endif() set(eglib_headers glib.h diff --git a/src/mono/mono/eglib/giconv.c b/src/mono/mono/eglib/giconv.c index 664ad31bba258a..79c45c8182adea 100644 --- a/src/mono/mono/eglib/giconv.c +++ b/src/mono/mono/eglib/giconv.c @@ -28,132 +28,28 @@ #include #include "../utils/mono-errno.h" +typedef gunichar2 char16_t; +#include + #ifdef _MSC_VER #define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE #else #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline)) #endif - #define UNROLL_DECODE_UTF8 0 -#define UNROLL_ENCODE_UTF8 0 - -static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf32be (gunichar c, char *outbuf, size_t outleft); - -static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf32le (gunichar c, char *outbuf, size_t outleft); - -static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf16be (gunichar c, char *outbuf, size_t outleft); - -static int 
decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf16le (gunichar c, char *outbuf, size_t outleft); static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf8 (gunichar c, char *outbuf, size_t outleft); - -static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_latin1 (gunichar c, char *outbuf, size_t outleft); #if G_BYTE_ORDER == G_LITTLE_ENDIAN -#define decode_utf32 decode_utf32le -#define encode_utf32 encode_utf32le #define decode_utf16 decode_utf16le -#define encode_utf16 encode_utf16le #else -#define decode_utf32 decode_utf32be -#define encode_utf32 encode_utf32be +#ifndef BIGENDIAN +#define BIGENDIAN +#endif #define decode_utf16 decode_utf16be -#define encode_utf16 encode_utf16be #endif -/* - * Unicode encoders and decoders - */ - -static FORCE_INLINE (uint32_t) -read_uint32_endian (unsigned char *inptr, unsigned endian) -{ - if (endian == G_LITTLE_ENDIAN) - return (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0]; - return (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3]; -} - -static int -decode_utf32_endian (char *inbuf, size_t inleft, gunichar *outchar, unsigned endian) -{ - unsigned char *inptr = (unsigned char *) inbuf; - gunichar c; - - if (inleft < 4) { - mono_set_errno (EINVAL); - return -1; - } - - c = read_uint32_endian (inptr, endian); - - if (c >= 0xd800 && c < 0xe000) { - mono_set_errno (EILSEQ); - return -1; - } else if (c >= 0x110000) { - mono_set_errno (EILSEQ); - return -1; - } - - *outchar = c; - - return 4; -} - -static int -decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar) -{ - return decode_utf32_endian (inbuf, inleft, outchar, G_BIG_ENDIAN); -} - -static int -decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar) -{ - return decode_utf32_endian (inbuf, inleft, outchar, G_LITTLE_ENDIAN); -} - -static int -encode_utf32be (gunichar c, char *outbuf, size_t outleft) -{ 
- unsigned char *outptr = (unsigned char *) outbuf; - - if (outleft < 4) { - mono_set_errno (E2BIG); - return -1; - } - - outptr[0] = (c >> 24) & 0xff; - outptr[1] = (c >> 16) & 0xff; - outptr[2] = (c >> 8) & 0xff; - outptr[3] = c & 0xff; - - return 4; -} - -static int -encode_utf32le (gunichar c, char *outbuf, size_t outleft) -{ - unsigned char *outptr = (unsigned char *) outbuf; - - if (outleft < 4) { - mono_set_errno (E2BIG); - return -1; - } - - outptr[0] = c & 0xff; - outptr[1] = (c >> 8) & 0xff; - outptr[2] = (c >> 16) & 0xff; - outptr[3] = (c >> 24) & 0xff; - - return 4; -} - static FORCE_INLINE (uint16_t) read_uint16_endian (unsigned char *inptr, unsigned endian) { @@ -233,50 +129,6 @@ write_uint16_endian (unsigned char *outptr, uint16_t c, unsigned endian) outptr[1] = c & 0xff; } -static FORCE_INLINE (int) -encode_utf16_endian (gunichar c, char *outbuf, size_t outleft, unsigned endian) -{ - unsigned char *outptr = (unsigned char *) outbuf; - gunichar2 ch; - gunichar c2; - - if (c < 0x10000) { - if (outleft < 2) { - mono_set_errno (E2BIG); - return -1; - } - - write_uint16_endian (outptr, GUNICHAR_TO_UINT16 (c), endian); - return 2; - } else { - if (outleft < 4) { - mono_set_errno (E2BIG); - return -1; - } - - c2 = c - 0x10000; - - ch = (gunichar2) ((c2 >> 10) + 0xd800); - write_uint16_endian (outptr, ch, endian); - - ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00); - write_uint16_endian (outptr + 2, ch, endian); - return 4; - } -} - -static int -encode_utf16be (gunichar c, char *outbuf, size_t outleft) -{ - return encode_utf16_endian (c, outbuf, outleft, G_BIG_ENDIAN); -} - -static int -encode_utf16le (gunichar c, char *outbuf, size_t outleft) -{ - return encode_utf16_endian (c, outbuf, outleft, G_LITTLE_ENDIAN); -} - static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar) { @@ -336,89 +188,6 @@ decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar) return GSIZE_TO_INT(n); } -static int -encode_utf8 (gunichar c, char *outbuf, 
size_t outleft) -{ - unsigned char *outptr = (unsigned char *) outbuf; - int base; - size_t n; - - if (c < 0x80) { - outptr[0] = GUNICHAR_TO_UINT8 (c); - return 1; - } else if (c < 0x800) { - base = 192; - n = 2; - } else if (c < 0x10000) { - base = 224; - n = 3; - } else if (c < 0x200000) { - base = 240; - n = 4; - } else if (c < 0x4000000) { - base = 248; - n = 5; - } else { - base = 252; - n = 6; - } - - if (outleft < n) { - mono_set_errno (E2BIG); - return -1; - } - -#if UNROLL_ENCODE_UTF8 - switch (n) { - case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6; - case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6; - case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6; - case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6; - case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6; - case 1: outptr[0] = c | base; - } -#else - for (size_t i = n - 1; i > 0; i--) { - outptr[i] = (c & 0x3f) | 0x80; - c >>= 6; - } - - outptr[0] = GUNICHAR_TO_UINT8 (c | base); -#endif - - return GSIZE_TO_INT(n); -} - -static int -decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar) -{ - *outchar = (unsigned char) *inbuf; - return 1; -} - -static int -encode_latin1 (gunichar c, char *outbuf, size_t outleft) -{ - if (outleft < 1) { - mono_set_errno (E2BIG); - return -1; - } - - if (c > 0xff) { - mono_set_errno (EILSEQ); - return -1; - } - - *outbuf = (char) c; - - return 1; -} - - -/* - * Simple conversion API - */ - static gpointer error_quark = (gpointer)"ConvertError"; gpointer @@ -426,9 +195,6 @@ g_convert_error_quark (void) { return error_quark; } -/* - * Unicode conversion - */ /** * An explanation of the conversion can be found at: @@ -559,162 +325,81 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) return outbuf; } -static gunichar2 * -eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned 
endian) +static FORCE_INLINE (void) +map_error(GError **err) { - gunichar2 *outbuf, *outptr; - size_t outlen = 0; - size_t inleft; - char *inptr; - gunichar c; - int u, n; - - g_return_val_if_fail (str != NULL, NULL); - - if (len < 0) { - if (include_nuls) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length"); - return NULL; - } - - len = (glong)strlen (str); - } - - inptr = (char *) str; - inleft = len; - - while (inleft > 0) { - if ((n = decode_utf8 (inptr, inleft, &c)) < 0) - goto error; - - if (c == 0 && !include_nuls) - break; - - if ((u = g_unichar_to_utf16_endian (c, NULL, endian)) < 0) { - if (replace_invalid_codepoints) { - u = 2; - } else { - mono_set_errno (EILSEQ); - goto error; - } - } - - outlen += u; - inleft -= n; - inptr += n; - } - - if (items_read) - *items_read = GPTRDIFF_TO_LONG (inptr - str); - - if (items_written) - *items_written = (glong)outlen; - - if (G_LIKELY (!custom_alloc_func)) - outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2)); - else - outptr = outbuf = (gunichar2 *)custom_alloc_func ((outlen + 1) * sizeof (gunichar2), custom_alloc_data); - - if (G_UNLIKELY (custom_alloc_func && !outbuf)) { - mono_set_errno (ENOMEM); - goto error; - } - - inptr = (char *) str; - inleft = len; - - while (inleft > 0) { - if ((n = decode_utf8 (inptr, inleft, &c)) < 0) - break; - - if (c == 0 && !include_nuls) - break; - - u = g_unichar_to_utf16_endian (c, outptr, endian); - if ((u < 0) && replace_invalid_codepoints) { - outptr[0] = 0xFFFD; - outptr[1] = 0xFFFD; - u = 2; - } - - outptr += u; - inleft -= n; - inptr += n; - } - - *outptr = '\0'; - - return outbuf; - -error: - if (errno == ENOMEM) { + if (errno == 0) return; + if (errno == ERROR_INSUFFICIENT_BUFFER) { g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); - } else if (errno == EILSEQ) { + } else if (errno == ERROR_NO_UNICODE_TRANSLATION) { g_set_error (err, G_CONVERT_ERROR, 
G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Illegal byte sequence encountered in the input."); - } else if (items_read) { - /* partial input is ok if we can let our caller know... */ } else { g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, "Partial byte sequence encountered in the input."); } - - if (items_read) - *items_read = GPTRDIFF_TO_LONG (inptr - str); - - if (items_written) - *items_written = 0; - - return NULL; } -gunichar2 * -g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) +static gunichar2 * +g_utf8_to_utf16_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err, int dwFlags, bool treatAsLE) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BYTE_ORDER); + errno = 0; + gunichar2* lpDestStr = NULL; + int ret = minipal_utf8_to_utf16_allocate (str, len, &lpDestStr, dwFlags, treatAsLE); + if (items_written) + *items_written = errno == 0 ? ret : 0; + map_error(err); + return lpDestStr; } -gunichar2 * -g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) +static gunichar2 * +g_utf8_to_utf16le_custom_alloc_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, bool treatAsLE) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BIG_ENDIAN); -} + errno = 0; + int ret = minipal_utf8_to_utf16_preallocated (str, len, 0, 0, 0, /* treatAsLE */ treatAsLE); + map_error(err); -gunichar2 * -g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) -{ - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_LITTLE_ENDIAN); + if (items_written) + *items_written = errno == 0 ? 
ret : 0; + + if (ret <= 0) + return NULL; + + gunichar2* lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); + ret = minipal_utf8_to_utf16_preallocated (str, len, &lpDestStr, ret, MB_ERR_INVALID_CHARS, /* treatAsLE */ treatAsLE); + map_error(err); + return lpDestStr; } gunichar2 * -g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) +g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); + return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MB_ERR_INVALID_CHARS, false); } gunichar2 * -g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) +g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BIG_ENDIAN); + return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MB_ERR_INVALID_CHARS, true); } gunichar2 * -g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) +eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_LITTLE_ENDIAN); + return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, 0, false); } gunichar2 * -eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong 
*items_read, glong *items_written, GError **err) +g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, NULL, NULL, err, G_BYTE_ORDER); + return g_utf8_to_utf16le_custom_alloc_impl (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, false); } gunichar2 * -eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) +g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, NULL, NULL, err, G_BYTE_ORDER); + return g_utf8_to_utf16le_custom_alloc_impl (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, true); } gunichar * @@ -789,120 +474,49 @@ g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_wri return outbuf; } -static -gchar * -eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) +static gchar * +g_utf16_to_utf8_impl (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err, bool treatAsLE) { - char *inptr, *outbuf, *outptr; - size_t outlen = 0; - size_t inleft; - gunichar c; - int n; - - g_return_val_if_fail (str != NULL, NULL); - - if (len < 0) { - len = 0; - while (str[len]) - len++; - } - - inptr = (char *) str; - inleft = len * 2; - - while (inleft > 0) { - if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) { - if (n == -2 && inleft > 2) { - /* This means that the first UTF-16 char was read, but second failed */ - inleft -= 2; - inptr += 2; - } 
- - if (errno == EILSEQ) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, - "Illegal byte sequence encountered in the input."); - } else if (items_read) { - /* partial input is ok if we can let our caller know... */ - break; - } else { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, - "Partial byte sequence encountered in the input."); - } - - if (items_read) - *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2); - - if (items_written) - *items_written = 0; - - return NULL; - } else if (c == 0) - break; - - outlen += g_unichar_to_utf8 (c, NULL); - inleft -= n; - inptr += n; - } - - if (items_read) - *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2); + errno = 0; + gchar* lpDestStr = NULL; + int ret = minipal_utf16_to_utf8_allocate (str, len, &lpDestStr, treatAsLE); if (items_written) - *items_written = (glong)outlen; - - if (G_LIKELY (!custom_alloc_func)) - outptr = outbuf = g_malloc (outlen + 1); - else - outptr = outbuf = (char *)custom_alloc_func (outlen + 1, custom_alloc_data); - - if (G_UNLIKELY (custom_alloc_func && !outbuf)) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); - if (items_written) - *items_written = 0; - return NULL; - } + *items_written = errno == 0 ? 
ret : 0; - inptr = (char *) str; - inleft = len * 2; - - while (inleft > 0) { - if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) - break; - else if (c == 0) - break; - - outptr += g_unichar_to_utf8 (c, outptr); - inleft -= n; - inptr += n; - } - - *outptr = '\0'; - - return outbuf; + map_error(err); + return lpDestStr; } gchar * g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BYTE_ORDER); + return g_utf16_to_utf8_impl (str, len, items_read, items_written, err, /* treatAsLE */ false); } gchar * g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_LITTLE_ENDIAN); -} - -gchar * -g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) -{ - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BIG_ENDIAN); + return g_utf16_to_utf8_impl (str, len, items_read, items_written, err, /* treatAsLE */ true); } gchar * g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); + errno = 0; + int ret = minipal_utf16_to_utf8_preallocated (str, len, 0, 0); + map_error(err); + + if (items_written) + *items_written = errno == 0 ? 
ret : 0; + + if (ret <= 0) + return NULL; + + gchar* lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); + ret = minipal_utf16_to_utf8_preallocated (str, len, &lpDestStr, ret); + map_error(err); + return lpDestStr; } gunichar * diff --git a/src/mono/mono/eglib/glib.h b/src/mono/mono/eglib/glib.h index e438c00298ec72..fcd8d2e37bdae1 100644 --- a/src/mono/mono/eglib/glib.h +++ b/src/mono/mono/eglib/glib.h @@ -882,14 +882,11 @@ gunichar *g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_writte gunichar *g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); G_EXTERN_C // Used by libtest, at least. gunichar2 *g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); -gunichar2 *g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); gunichar2 *g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); -gunichar2 *eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); gunichar2 *eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); G_EXTERN_C // Used by libtest, at least. 
gchar *g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err); gchar *g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err); -gchar *g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err); gunichar *g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err); gchar *g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err); gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err); @@ -915,7 +912,6 @@ gpointer g_fixed_buffer_custom_allocator (gsize req_size, gpointer custom_alloc_data); gunichar2 *g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); -gunichar2 *g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); gunichar2 *g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); gchar *g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); diff --git a/src/mono/mono/eglib/test/utf8.c b/src/mono/mono/eglib/test/utf8.c index d36dbfaa54ed04..5602bbcbcb7208 100644 --- a/src/mono/mono/eglib/test/utf8.c +++ b/src/mono/mono/eglib/test/utf8.c @@ -155,7 +155,7 @@ compare_utf8_to_utf16_explicit (const gunichar2 *expected, const gchar *utf8, gl gerror = NULL; if (include_nuls) - ret = eg_utf8_to_utf16_with_nuls (utf8, size_spec, &in_read, &out_read, &gerror); + ret = g_utf8_to_utf16 
(utf8, size_spec, &in_read, &out_read, &gerror); else ret = g_utf8_to_utf16 (utf8, size_spec, &in_read, &out_read, &gerror); @@ -271,7 +271,7 @@ test_utf8_to_utf16_with_nuls (void) #endif /* implicit length is forbidden */ - if (eg_utf8_to_utf16_with_nuls (src1, -1, NULL, NULL, NULL) != NULL) + if (g_utf8_to_utf16 (src1, -1, NULL, NULL, NULL) != NULL) return FAILED ("explicit nulls must fail with -1 length\n"); /* empty string */ @@ -699,7 +699,7 @@ utf8_byteslen (const gchar *src) static Test utf8_tests [] = { {"g_utf16_to_utf8", test_utf16_to_utf8}, {"g_utf8_to_utf16", test_utf8_to_utf16}, - {"g_utf8_to_utf16_with_nuls", test_utf8_to_utf16_with_nuls}, + {"g_utf8_to_utf16_nuls", test_utf8_to_utf16_with_nuls}, {"g_utf8_seq", test_utf8_seq}, {"g_ucs4_to_utf16", test_ucs4_to_utf16 }, {"g_utf16_to_ucs4", test_utf16_to_ucs4 }, diff --git a/src/mono/mono/metadata/object.c b/src/mono/mono/metadata/object.c index 8604114fe520f5..b267334250c6c2 100644 --- a/src/mono/mono/metadata/object.c +++ b/src/mono/mono/metadata/object.c @@ -6361,7 +6361,7 @@ mono_string_new_utf8_len (const char *text, guint length, MonoError *error) gunichar2 *ut = NULL; glong items_written; - ut = eg_utf8_to_utf16_with_nuls (text, length, NULL, &items_written, &eg_error); + ut = g_utf8_to_utf16 (text, length, NULL, &items_written, &eg_error); if (eg_error) { o = NULL_HANDLE_STRING; diff --git a/src/coreclr/pal/src/locale/utf8.cpp b/src/native/minipal/utf8.cpp similarity index 72% rename from src/coreclr/pal/src/locale/utf8.cpp rename to src/native/minipal/utf8.cpp index f07c69ff7e15f3..ce967669c46d3d 100644 --- a/src/coreclr/pal/src/locale/utf8.cpp +++ b/src/native/minipal/utf8.cpp @@ -14,142 +14,169 @@ Revision History: --*/ -#include "pal/utf8.h" -#include "pal/malloc.hpp" +#include -using namespace CorUnix; +#include +#include +#include +#include #define FASTLOOP +#ifdef TARGET_WINDOWS +#define W(str) L ## str +#else +#define W(str) u##str +#endif + struct CharUnicodeInfo { - static const WCHAR 
HIGH_SURROGATE_START = 0xd800; - static const WCHAR HIGH_SURROGATE_END = 0xdbff; - static const WCHAR LOW_SURROGATE_START = 0xdc00; - static const WCHAR LOW_SURROGATE_END = 0xdfff; + static const char16_t HIGH_SURROGATE_START = 0xd800; + static const char16_t HIGH_SURROGATE_END = 0xdbff; + static const char16_t LOW_SURROGATE_START = 0xdc00; + static const char16_t LOW_SURROGATE_END = 0xdfff; }; struct Char { // Test if the wide character is a high surrogate - static bool IsHighSurrogate(const WCHAR c) + static bool IsHighSurrogate(const char16_t c) { return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START; } // Test if the wide character is a low surrogate - static bool IsLowSurrogate(const WCHAR c) + static bool IsLowSurrogate(const char16_t c) { return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START; } // Test if the wide character is a surrogate half - static bool IsSurrogate(const WCHAR c) + static bool IsSurrogate(const char16_t c) { return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START; } // Test if the wide character is a high surrogate - static bool IsHighSurrogate(const WCHAR* s, int index) + static bool IsHighSurrogate(const char16_t* s, int index) { return IsHighSurrogate(s[index]); } // Test if the wide character is a low surrogate - static bool IsLowSurrogate(const WCHAR* s, int index) + static bool IsLowSurrogate(const char16_t* s, int index) { return IsLowSurrogate(s[index]); } // Test if the wide character is a surrogate half - static bool IsSurrogate(const WCHAR* s, int index) + static bool IsSurrogate(const char16_t* s, int index) { return IsSurrogate(s[index]); } }; -class ArgumentException +size_t wcslen(const char16_t* str) { + size_t nChar = 0; + while (*str++) nChar++; + return nChar; +} -public: - ArgumentException(LPCSTR message) +int wcscpy_s(char16_t *_Dst, size_t _SizeInWords, const char16_t *_Src) +{ + + char16_t* p = _Dst; + size_t available = _SizeInWords; + + if (!_Src || !_Dst || _SizeInWords == 0) return EINVAL; + 
+ while ((*p++ = *_Src++) != 0 && --available > 0); + + if (available == 0) { + _Dst = 0; + return ERANGE; } - ArgumentException(LPCSTR message, LPCSTR argName) +#ifdef DEBUG + size_t offset = _SizeInWords - available + 1; + if (offset < _SizeInWords) { + memset((_Dst) + (offset), 0xFD, ((_SizeInWords) - (offset)) * sizeof(*(_Dst))); } -}; +#endif + + return 0; +} -class ArgumentNullException : public ArgumentException +int wcscat_s(char16_t *_Dst, size_t _SizeInWords, const char16_t *_Src) { -public: - ArgumentNullException(LPCSTR argName) - : ArgumentException("Argument is NULL", argName) - { + char16_t* p = _Dst; + size_t available = _SizeInWords; - } -}; + if (!_Src || !_Dst || _SizeInWords == 0) return EINVAL; -class ArgumentOutOfRangeException : public ArgumentException -{ -public: - ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message) - : ArgumentException(message, argName) + while (available > 0 && *p != 0) { - + p++; + available--; } -}; -class InsufficientBufferException : public ArgumentException -{ -public: - InsufficientBufferException(LPCSTR message, LPCSTR argName) - : ArgumentException(message, argName) + if (available == 0) { + _Dst = 0; + return EINVAL; + } + while ((*p++ = *_Src++) != 0 && --available > 0) + { } -}; -class Contract -{ -public: - static void Assert(bool cond, LPCSTR str) + if (available == 0) { - if (!cond) - { - throw ArgumentException(str); - } + _Dst = 0; + return ERANGE; } - static void EndContractBlock() +#ifdef DEBUG + size_t offset = _SizeInWords - available + 1; + if (offset < _SizeInWords) { + memset((_Dst) + (offset), 0xFD, ((_SizeInWords) - (offset)) * sizeof(*(_Dst))); } -}; +#endif + return 0; +} -class DecoderFallbackException : public ArgumentException -{ - BYTE *bytesUnknown; - int index; +#define ContractAssert(cond) \ + if (!(cond)) \ + { \ + errno = ERROR_INVALID_PARAMETER; \ + return 0; \ + } -public: - DecoderFallbackException( - LPCSTR message, BYTE bytesUnknown[], int index) : 
ArgumentException(message) - { - this->bytesUnknown = bytesUnknown; - this->index = index; +#define ContractAssertVoid(cond) \ + if (!(cond)) \ + { \ + errno = ERROR_INVALID_PARAMETER; \ + return; \ } - BYTE *BytesUnknown() - { - return (bytesUnknown); +#define ContractAssertFreeFallback(cond) \ + if (!(cond)) \ + { \ + errno = ERROR_INVALID_PARAMETER; \ + if (fallback) free(fallback); \ + return 0; \ } - int GetIndex() - { - return index; +#define RETURN_ON_ERROR \ + if (errno) \ + { \ + if (fallback) free(fallback); \ + return 0; \ } -}; class DecoderFallbackBuffer; @@ -171,7 +198,7 @@ class DecoderFallback class DecoderReplacementFallback : public DecoderFallback { // Our variables - WCHAR strDefault[2]; + char16_t strDefault[2]; int strDefaultLength; public: @@ -180,16 +207,14 @@ class DecoderReplacementFallback : public DecoderFallback { } - DecoderReplacementFallback(const WCHAR* replacement) + DecoderReplacementFallback(const char16_t* replacement) { // Must not be null - if (replacement == nullptr) - throw ArgumentNullException("replacement"); - Contract::EndContractBlock(); + ContractAssertVoid(replacement != nullptr) // Make sure it doesn't have bad surrogate pairs bool bFoundHigh = false; - int replacementLength = PAL_wcslen((const WCHAR *)replacement); + int replacementLength = wcslen((const char16_t *)replacement); for (int i = 0; i < replacementLength; i++) { // Found a surrogate? 
@@ -221,14 +246,13 @@ class DecoderReplacementFallback : public DecoderFallback else if (bFoundHigh) break; } - if (bFoundHigh) - throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement"); + ContractAssertVoid(!bFoundHigh) wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); strDefaultLength = replacementLength; } - WCHAR* GetDefaultString() + char16_t* GetDefaultString() { return strDefault; } @@ -251,12 +275,10 @@ class DecoderFallbackBuffer // These wrap the internal methods so that we can check for people doing stuff that's incorrect public: - virtual ~DecoderFallbackBuffer() = default; - - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = 0; + virtual bool Fallback(unsigned char bytesUnknown[], int index, int size) = 0; // Get next character - virtual WCHAR GetNextChar() = 0; + virtual char16_t GetNextChar() = 0; //Back up a character virtual bool MovePrevious() = 0; @@ -267,14 +289,14 @@ class DecoderFallbackBuffer // Clear the buffer virtual void Reset() { - while (GetNextChar() != (WCHAR)0); + while (GetNextChar() != (char16_t)0); } // Internal items to help us figure out what we're doing as far as error messages, etc. // These help us with our performance and messages internally protected: - BYTE* byteStart; - WCHAR* charEnd; + unsigned char* byteStart; + char16_t* charEnd; // Internal reset void InternalReset() @@ -285,7 +307,7 @@ class DecoderFallbackBuffer // Set the above values // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. 
- void InternalInitialize(BYTE* byteStart, WCHAR* charEnd) + void InternalInitialize(unsigned char* byteStart, char16_t* charEnd) { this->byteStart = byteStart; this->charEnd = charEnd; @@ -299,17 +321,20 @@ class DecoderFallbackBuffer // Right now this has both bytes and bytes[], since we might have extra bytes, hence the // array, and we might need the index, hence the byte* // Don't touch ref chars unless we succeed - virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size) + virtual bool InternalFallback(unsigned char bytes[], unsigned char* pBytes, char16_t** chars, int size) { - Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize"); + ContractAssert(byteStart != nullptr) + + bool fallbackResult = this->Fallback(bytes, (int)(pBytes - byteStart - size), size); + if (errno) return false; // See if there's a fallback character and we have an output buffer then copy our string. - if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) + if (fallbackResult) { // Copy the chars to our output - WCHAR ch; - WCHAR* charTemp = *chars; + char16_t ch; + char16_t* charTemp = *chars; bool bHighSurrogate = false; while ((ch = GetNextChar()) != 0) { @@ -319,15 +344,13 @@ class DecoderFallbackBuffer if (Char::IsHighSurrogate(ch)) { // High Surrogate - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); + ContractAssert(!bHighSurrogate) bHighSurrogate = true; } else { // Low surrogate - if (!bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); + ContractAssert(bHighSurrogate) bHighSurrogate = false; } } @@ -342,8 +365,7 @@ class DecoderFallbackBuffer } // Need to make sure that bHighSurrogate isn't true - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); + ContractAssert(!bHighSurrogate) // Now we aren't going to be 
false, so its OK to update chars *chars = charTemp; @@ -353,19 +375,22 @@ class DecoderFallbackBuffer } // This version just counts the fallback and doesn't actually copy anything. - virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) - // Right now this has both bytes[] and BYTE* bytes, since we might have extra bytes, hence the + virtual int InternalFallback(unsigned char bytes[], unsigned char* pBytes, int size) + // Right now this has both bytes[] and unsigned char* bytes, since we might have extra bytes, hence the // array, and we might need the index, hence the byte* { - Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize"); + ContractAssert(byteStart != nullptr) + + bool fallbackResult = this->Fallback(bytes, (int)(pBytes - byteStart - size), size); + if (errno) return 0; // See if there's a fallback character and we have an output buffer then copy our string. - if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) + if (fallbackResult) { int count = 0; - WCHAR ch; + char16_t ch; bool bHighSurrogate = false; while ((ch = GetNextChar()) != 0) { @@ -375,15 +400,13 @@ class DecoderFallbackBuffer if (Char::IsHighSurrogate(ch)) { // High Surrogate - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); + ContractAssert(!bHighSurrogate) bHighSurrogate = true; } else { // Low surrogate - if (!bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); + ContractAssert(bHighSurrogate) bHighSurrogate = false; } } @@ -392,8 +415,7 @@ class DecoderFallbackBuffer } // Need to make sure that bHighSurrogate isn't true - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); + ContractAssert(!bHighSurrogate) return count; } @@ -401,18 +423,12 @@ class DecoderFallbackBuffer // If no fallback return 0 return 0; } - - // private helper 
methods - void ThrowLastBytesRecursive(BYTE bytesUnknown[]) - { - throw ArgumentException("Recursive fallback not allowed"); - } }; class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer { // Store our default string - WCHAR strDefault[2]; + char16_t strDefault[2]; int strDefaultLength; int fallbackCount = -1; int fallbackIndex = -1; @@ -422,18 +438,15 @@ class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback) { wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); + strDefaultLength = wcslen((const char16_t *)fallback->GetDefaultString()); } // Fallback Methods - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) + virtual bool Fallback(unsigned char bytesUnknown[], int index, int size) { // We expect no previous fallback in our buffer // We can't call recursively but others might (note, we don't test on last char!!!) - if (fallbackCount >= 1) - { - ThrowLastBytesRecursive(bytesUnknown); - } + ContractAssert(fallbackCount < 1) // Go ahead and get our fallback if (strDefaultLength == 0) @@ -445,7 +458,7 @@ class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer return true; } - virtual WCHAR GetNextChar() + virtual char16_t GetNextChar() { // We want it to get < 0 because == 0 means that the current/last character is a fallback // and we need to detect recursion. We could have a flag but we already have this counter. 
@@ -465,8 +478,7 @@ class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer } // Now make sure its in the expected range - Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, - "Index exceeds buffer range"); + ContractAssert(fallbackIndex < strDefaultLength && fallbackIndex >= 0) return strDefault[fallbackIndex]; } @@ -501,7 +513,7 @@ class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer } // This version just counts the fallback and doesn't actually copy anything. - virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) + virtual int InternalFallback(unsigned char bytes[], unsigned char* pBytes, int size) // Right now this has both bytes and bytes[], since we might have extra bytes, hence the // array, and we might need the index, hence the byte* { @@ -517,13 +529,12 @@ class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer { } - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) + virtual bool Fallback(unsigned char bytesUnknown[], int index, int size) { - throw DecoderFallbackException( - "Unable to translate UTF-8 character to Unicode", bytesUnknown, index); + ContractAssert(false) } - virtual WCHAR GetNextChar() + virtual char16_t GetNextChar() { return 0; } @@ -552,7 +563,13 @@ class DecoderExceptionFallback : public DecoderFallback virtual DecoderFallbackBuffer* CreateFallbackBuffer() { - return InternalNew(); + DecoderExceptionFallbackBuffer* pMem = (DecoderExceptionFallbackBuffer*)malloc(sizeof(DecoderExceptionFallbackBuffer)); + if (pMem == nullptr) + { + errno = ERROR_INSUFFICIENT_BUFFER; + return nullptr; + } + return new (pMem) DecoderExceptionFallbackBuffer(); } // Maximum number of characters that this instance of this fallback could return @@ -564,70 +581,20 @@ class DecoderExceptionFallback : public DecoderFallback DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer() { - return InternalNew(this); -} - -class EncoderFallbackException : 
public ArgumentException -{ - WCHAR charUnknown; - WCHAR charUnknownHigh; - WCHAR charUnknownLow; - int index; - -public: - EncoderFallbackException( - LPCSTR message, WCHAR charUnknown, int index) : ArgumentException(message) - { - this->charUnknown = charUnknown; - this->index = index; - } - - EncoderFallbackException( - LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException(message) + DecoderReplacementFallbackBuffer* pMem = (DecoderReplacementFallbackBuffer*)malloc(sizeof(DecoderReplacementFallbackBuffer)); + if (pMem == nullptr) { - if (!Char::IsHighSurrogate(charUnknownHigh)) - { - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - } - if (!Char::IsLowSurrogate(charUnknownLow)) - { - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - } - Contract::EndContractBlock(); - - this->charUnknownHigh = charUnknownHigh; - this->charUnknownLow = charUnknownLow; - this->index = index; + errno = ERROR_INSUFFICIENT_BUFFER; + return nullptr; } - - WCHAR GetCharUnknown() + pMem = new (pMem) DecoderReplacementFallbackBuffer(this); + if (errno) { - return (charUnknown); + free(pMem); + return nullptr; } - - WCHAR GetCharUnknownHigh() - { - return (charUnknownHigh); - } - - WCHAR GetCharUnknownLow() - { - return (charUnknownLow); - } - - int GetIndex() - { - return index; - } - - // Return true if the unknown character is a surrogate pair. 
- bool IsUnknownSurrogate() - { - return (charUnknownHigh != '\0'); - } -}; + return pMem; +} class EncoderFallbackBuffer; @@ -648,7 +615,7 @@ class EncoderFallback class EncoderReplacementFallback : public EncoderFallback { // Our variables - WCHAR strDefault[2]; + char16_t strDefault[2]; int strDefaultLength; public: @@ -657,16 +624,14 @@ class EncoderReplacementFallback : public EncoderFallback { } - EncoderReplacementFallback(const WCHAR* replacement) + EncoderReplacementFallback(const char16_t* replacement) { // Must not be null - if (replacement == nullptr) - throw ArgumentNullException("replacement"); - Contract::EndContractBlock(); + ContractAssertVoid(replacement != nullptr) // Make sure it doesn't have bad surrogate pairs bool bFoundHigh = false; - int replacementLength = PAL_wcslen((const WCHAR *)replacement); + int replacementLength = wcslen((const char16_t *)replacement); for (int i = 0; i < replacementLength; i++) { // Found a surrogate? @@ -698,14 +663,13 @@ class EncoderReplacementFallback : public EncoderFallback else if (bFoundHigh) break; } - if (bFoundHigh) - throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement"); + ContractAssertVoid(!bFoundHigh) wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); strDefaultLength = replacementLength; } - WCHAR* GetDefaultString() + char16_t* GetDefaultString() { return strDefault; } @@ -728,14 +692,12 @@ class EncoderFallbackBuffer // These wrap the internal methods so that we can check for people doing stuff that is incorrect public: - virtual ~EncoderFallbackBuffer() = default; + virtual bool Fallback(char16_t charUnknown, int index) = 0; - virtual bool Fallback(WCHAR charUnknown, int index) = 0; - - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = 0; + virtual bool Fallback(char16_t charUnknownHigh, char16_t charUnknownLow, int index) = 0; // Get next character - virtual WCHAR GetNextChar() = 0; + virtual char16_t 
GetNextChar() = 0; // Back up a character virtual bool MovePrevious() = 0; @@ -747,14 +709,14 @@ class EncoderFallbackBuffer // Clear the buffer virtual void Reset() { - while (GetNextChar() != (WCHAR)0); + while (GetNextChar() != (char16_t)0); } // Internal items to help us figure out what we're doing as far as error messages, etc. // These help us with our performance and messages internally protected: - WCHAR* charStart; - WCHAR* charEnd; + char16_t* charStart; + char16_t* charEnd; bool setEncoder; bool bUsedEncoder; bool bFallingBack = false; @@ -773,7 +735,7 @@ class EncoderFallbackBuffer // Set the above values // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. - void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder) + void InternalInitialize(char16_t* charStart, char16_t* charEnd, bool setEncoder) { this->charStart = charStart; this->charEnd = charEnd; @@ -783,9 +745,9 @@ class EncoderFallbackBuffer this->iRecursionCount = 0; } - WCHAR InternalGetNextChar() + char16_t InternalGetNextChar() { - WCHAR ch = GetNextChar(); + char16_t ch = GetNextChar(); bFallingBack = (ch != 0); if (ch == 0) iRecursionCount = 0; return ch; @@ -799,11 +761,10 @@ class EncoderFallbackBuffer // Note that this could also change the contents of this->encoder, which is the same // object that the caller is using, so the caller could mess up the encoder for us // if they aren't careful. 
- virtual bool InternalFallback(WCHAR ch, WCHAR** chars) + virtual bool InternalFallback(char16_t ch, char16_t** chars) { // Shouldn't have null charStart - Contract::Assert(charStart != nullptr, - "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized"); + ContractAssert(charStart != nullptr) // Get our index, remember chars was preincremented to point at next char, so have to -1 int index = (int)(*chars - charStart) - 1; @@ -820,12 +781,11 @@ class EncoderFallbackBuffer else { // Might have a low surrogate - WCHAR cNext = **chars; + char16_t cNext = **chars; if (Char::IsLowSurrogate(cNext)) { // If already falling back then fail - if (bFallingBack && iRecursionCount++ > iMaxRecursion) - ThrowLastCharRecursive(ch, cNext); + ContractAssert(!bFallingBack || iRecursionCount++ <= iMaxRecursion) // Next is a surrogate, add it as surrogate pair, and increment chars (*chars)++; @@ -838,33 +798,19 @@ class EncoderFallbackBuffer } // If already falling back then fail - if (bFallingBack && iRecursionCount++ > iMaxRecursion) - ThrowLastCharRecursive((int)ch); + ContractAssert(!bFallingBack || iRecursionCount++ <= iMaxRecursion) // Fall back our char bFallingBack = Fallback(ch, index); return bFallingBack; } - - // private helper methods - void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate) - { - // Throw it, using our complete character - throw ArgumentException("Recursive fallback not allowed", "chars"); - } - - void ThrowLastCharRecursive(int utf32Char) - { - throw ArgumentException("Recursive fallback not allowed", "chars"); - } - }; class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer { // Store our default string - WCHAR strDefault[4]; + char16_t strDefault[4]; int strDefaultLength; int fallbackCount = -1; int fallbackIndex = -1; @@ -875,25 +821,16 @@ class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer // 2X in case we're a surrogate pair wcscpy_s(strDefault, ARRAY_SIZE(strDefault), 
fallback->GetDefaultString()); wcscat_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); + strDefaultLength = 2 * wcslen((const char16_t *)fallback->GetDefaultString()); } // Fallback Methods - virtual bool Fallback(WCHAR charUnknown, int index) + virtual bool Fallback(char16_t charUnknown, int index) { // If we had a buffer already we're being recursive, throw, it's probably at the suspect // character in our array. - if (fallbackCount >= 1) - { - // If we're recursive we may still have something in our buffer that makes this a surrogate - if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= 0 && - Char::IsLowSurrogate(strDefault[fallbackIndex + 1])) - ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + 1]); - - // Nope, just one character - ThrowLastCharRecursive((int)charUnknown); - } + ContractAssert(fallbackCount < 1) // Go ahead and get our fallback // Divide by 2 because we aren't a surrogate pair @@ -903,22 +840,15 @@ class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer return fallbackCount != 0; } - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) + virtual bool Fallback(char16_t charUnknownHigh, char16_t charUnknownLow, int index) { // Double check input surrogate pair - if (!Char::IsHighSurrogate(charUnknownHigh)) - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - - if (!Char::IsLowSurrogate(charUnknownLow)) - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - Contract::EndContractBlock(); + ContractAssert(Char::IsHighSurrogate(charUnknownHigh)) + ContractAssert(Char::IsLowSurrogate(charUnknownLow)) // If we had a buffer already we're being recursive, throw, it's probably at the suspect // character in our array. 
- if (fallbackCount >= 1) - ThrowLastCharRecursive(charUnknownHigh, charUnknownLow); + ContractAssert(fallbackCount < 1) // Go ahead and get our fallback fallbackCount = strDefaultLength; @@ -927,7 +857,7 @@ class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer return fallbackCount != 0; } - virtual WCHAR GetNextChar() + virtual char16_t GetNextChar() { // We want it to get < 0 because == 0 means that the current/last character is a fallback // and we need to detect recursion. We could have a flag but we already have this counter. @@ -947,8 +877,7 @@ class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer } // Now make sure its in the expected range - Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, - "Index exceeds buffer range"); + ContractAssert(fallbackIndex < strDefaultLength && fallbackIndex >= 0) return strDefault[fallbackIndex]; } @@ -991,34 +920,24 @@ class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer { } - virtual bool Fallback(WCHAR charUnknown, int index) + virtual bool Fallback(char16_t charUnknown, int index) { // Fall back our char - throw EncoderFallbackException("Unable to translate Unicode character to UTF-8", charUnknown, index); + ContractAssert(false) } - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) + virtual bool Fallback(char16_t charUnknownHigh, char16_t charUnknownLow, int index) { - if (!Char::IsHighSurrogate(charUnknownHigh)) - { - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - } - if (!Char::IsLowSurrogate(charUnknownLow)) - { - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - } - Contract::EndContractBlock(); + ContractAssert(Char::IsHighSurrogate(charUnknownHigh)) + ContractAssert(Char::IsLowSurrogate(charUnknownLow)) //int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow); // Fall back our char - throw 
EncoderFallbackException( - "Unable to translate Unicode character to UTF-8", charUnknownHigh, charUnknownLow, index); + ContractAssert(false) } - virtual WCHAR GetNextChar() + virtual char16_t GetNextChar() { return 0; } @@ -1046,7 +965,10 @@ class EncoderExceptionFallback : public EncoderFallback virtual EncoderFallbackBuffer* CreateFallbackBuffer() { - return InternalNew(); + EncoderExceptionFallbackBuffer* pMem = (EncoderExceptionFallbackBuffer*)malloc(sizeof(EncoderExceptionFallbackBuffer)); + if (pMem == nullptr) + return nullptr; + return new (pMem) EncoderExceptionFallbackBuffer(); } // Maximum number of characters that this instance of this fallback could return @@ -1058,7 +980,13 @@ class EncoderExceptionFallback : public EncoderFallback EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer() { - return InternalNew(this); + EncoderReplacementFallbackBuffer* pMem = (EncoderReplacementFallbackBuffer*)malloc(sizeof(EncoderReplacementFallbackBuffer)); + if (pMem == nullptr) + { + errno = ERROR_INSUFFICIENT_BUFFER; + return nullptr; + } + return new (pMem) EncoderReplacementFallbackBuffer(this); } class UTF8Encoding @@ -1075,65 +1003,39 @@ class UTF8Encoding DecoderReplacementFallback decoderReplacementFallback; DecoderExceptionFallback decoderExceptionFallback; +#if BIGENDIAN + bool treatAsLE; +#endif + bool InRange(int c, int begin, int end) { return begin <= c && c <= end; } - size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2) + size_t PtrDiff(char16_t* ptr1, char16_t* ptr2) { return ptr1 - ptr2; } - size_t PtrDiff(BYTE* ptr1, BYTE* ptr2) + size_t PtrDiff(unsigned char* ptr1, unsigned char* ptr2) { return ptr1 - ptr2; } - void ThrowBytesOverflow() - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount - throw InsufficientBufferException("The output byte buffer is too small to contain the encoded data", "bytes"); - } - - 
void ThrowBytesOverflow(bool nothingEncoded) - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount - if (nothingEncoded){ - ThrowBytesOverflow(); - } - } - - void ThrowCharsOverflow() - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented a decoder fallback with a broken GetMaxCharCount - throw InsufficientBufferException("The output char buffer is too small to contain the encoded data", "chars"); - } - - void ThrowCharsOverflow(bool nothingEncoded) - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an decoder fallback with a broken GetMaxCharCount - if (nothingEncoded){ - ThrowCharsOverflow(); - } - } - // During GetChars we had an invalid byte sequence // pSrc is backed up to the start of the bad sequence if we didn't have room to // fall it back. Otherwise pSrc remains where it is. 
- bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget) + bool FallbackInvalidByteSequence(unsigned char** pSrc, int ch, DecoderFallbackBuffer* fallback, char16_t** pTarget) { // Get our byte[] - BYTE* pStart = *pSrc; - BYTE bytesUnknown[3]; + unsigned char* pStart = *pSrc; + unsigned char bytesUnknown[3]; int size = GetBytesUnknown(pStart, ch, bytesUnknown); + bool fallbackResult = fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size); + RETURN_ON_ERROR // Do the actual fallback - if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size)) + if (!fallbackResult) { // Oops, it failed, back up to pStart *pSrc = pStart; @@ -1144,10 +1046,10 @@ class UTF8Encoding return true; } - int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback) + int FallbackInvalidByteSequence(unsigned char* pSrc, int ch, DecoderFallbackBuffer *fallback) { // Get our byte[] - BYTE bytesUnknown[3]; + unsigned char bytesUnknown[3]; int size = GetBytesUnknown(pSrc, ch, bytesUnknown); // Do the actual fallback @@ -1159,7 +1061,7 @@ class UTF8Encoding return count; } - int GetBytesUnknown(BYTE* pSrc, int ch, BYTE* bytesUnknown) + int GetBytesUnknown(unsigned char* pSrc, int ch, unsigned char* bytesUnknown) { int size; @@ -1168,14 +1070,14 @@ class UTF8Encoding if (ch < 0x100 && ch >= 0) { pSrc--; - bytesUnknown[0] = (BYTE)ch; + bytesUnknown[0] = (unsigned char)ch; size = 1; } // See if its an unfinished 2 byte sequence else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0) { pSrc--; - bytesUnknown[0] = (BYTE)((ch & 0x1F) | 0xc0); + bytesUnknown[0] = (unsigned char)((ch & 0x1F) | 0xc0); size = 1; } // So now we're either 2nd byte of 3 or 4 byte sequence or @@ -1188,24 +1090,24 @@ class UTF8Encoding { // 3rd byte of 4 byte sequence pSrc -= 3; - bytesUnknown[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0); - bytesUnknown[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80); - bytesUnknown[2] = (BYTE)(((ch)& 0x3F) | 0x80); + 
bytesUnknown[0] = (unsigned char)(((ch >> 12) & 0x07) | 0xF0); + bytesUnknown[1] = (unsigned char)(((ch >> 6) & 0x3F) | 0x80); + bytesUnknown[2] = (unsigned char)(((ch)& 0x3F) | 0x80); size = 3; } else if ((ch & (FinalByte >> 12)) != 0) { // 2nd byte of a 4 byte sequence pSrc -= 2; - bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0); - bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80); + bytesUnknown[0] = (unsigned char)(((ch >> 6) & 0x07) | 0xF0); + bytesUnknown[1] = (unsigned char)(((ch)& 0x3F) | 0x80); size = 2; } else { // 4th byte of a 4 byte sequence pSrc--; - bytesUnknown[0] = (BYTE)(((ch)& 0x07) | 0xF0); + bytesUnknown[0] = (unsigned char)(((ch)& 0x07) | 0xF0); size = 1; } } @@ -1216,15 +1118,15 @@ class UTF8Encoding { // So its 2nd byte of a 3 byte sequence pSrc -= 2; - bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0); - bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80); + bytesUnknown[0] = (unsigned char)(((ch >> 6) & 0x0F) | 0xE0); + bytesUnknown[1] = (unsigned char)(((ch)& 0x3F) | 0x80); size = 2; } else { // 1st byte of a 3 byte sequence pSrc--; - bytesUnknown[0] = (BYTE)(((ch)& 0x0F) | 0xE0); + bytesUnknown[0] = (unsigned char)(((ch)& 0x0F) | 0xE0); size = 1; } } @@ -1234,8 +1136,11 @@ class UTF8Encoding public: - UTF8Encoding(bool isThrowException) + UTF8Encoding(bool isThrowException, bool treatAsLE) : encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD")) +#if BIGENDIAN + , treatAsLE(treatAsLE) +#endif { if (isThrowException) { @@ -1258,14 +1163,14 @@ class UTF8Encoding const int SupplimentarySeq = 1 << 28; const int ThreeByteSeq = 1 << 27; - int GetCharCount(BYTE* bytes, int count) + int GetCharCount(unsigned char* bytes, int count) { - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr"); - Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0"); + ContractAssert(bytes != nullptr) + ContractAssert(count >= 0) // Initialize stuff - BYTE *pSrc = bytes; - BYTE *pEnd = pSrc + count; + 
unsigned char *pSrc = bytes; + unsigned char *pEnd = pSrc + count; // Start by assuming we have as many as count, charCount always includes the adjustment // for the character being decoded @@ -1304,8 +1209,7 @@ class UTF8Encoding ch = (ch << 6) | (cha & 0x3F); if ((ch & FinalByte) == 0) { - Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); + ContractAssertFreeFallback((ch & (SupplimentarySeq | ThreeByteSeq)) != 0) if ((ch & SupplimentarySeq) != 0) { if ((ch & (FinalByte >> 6)) != 0) { @@ -1345,6 +1249,7 @@ class UTF8Encoding if (fallback == nullptr) { fallback = decoderFallback->CreateFallbackBuffer(); + RETURN_ON_ERROR fallback->InternalInitialize(bytes, nullptr); } charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); @@ -1429,7 +1334,7 @@ class UTF8Encoding // don't fall into the fast decoding loop if we don't have enough bytes if (availableBytes <= 13) { // try to get over the remainder of the ascii characters fast though - BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered while (pSrc < pLocalEnd) { ch = *pSrc; pSrc++; @@ -1445,7 +1350,7 @@ class UTF8Encoding // To compute the upper bound, assume that all characters are ASCII characters at this point, // the boundary will be decreased for every non-ASCII character we encounter // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - BYTE *pStop = pSrc + availableBytes - 7; + unsigned char *pStop = pSrc + availableBytes - 7; while (pSrc < pStop) { ch = *pSrc; @@ -1466,7 +1371,7 @@ class UTF8Encoding // get pSrc 4-byte aligned if (((size_t)pSrc & 0x2) != 0) { - ch = *(USHORT*)pSrc; + ch = *(unsigned short*)pSrc; if ((ch & 0x8080) != 0) { goto LongCodeWithMask16; } @@ -1496,21 +1401,27 @@ class UTF8Encoding } break; -#if BIGENDIAN LongCodeWithMask32 : - // be careful about the sign extension - ch = 
(int)(((uint)ch) >> 16); - LongCodeWithMask16: - ch = (int)(((uint)ch) >> 8); -#else // BIGENDIAN - LongCodeWithMask32: +#if BIGENDIAN + // be careful about the sign extension + if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 16); + else +#else + ch &= 0xFF; +#endif + LongCodeWithMask16: +#if BIGENDIAN + if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 8); + else +#else ch &= 0xFF; -#endif // BIGENDIAN - pSrc++; - if (ch <= 0x7F) { - continue; - } +#endif + + pSrc++; + if (ch <= 0x7F) { + continue; + } LongCode: int chc = *pSrc; @@ -1610,6 +1521,7 @@ class UTF8Encoding if (fallback == nullptr) { fallback = decoderFallback->CreateFallbackBuffer(); + RETURN_ON_ERROR fallback->InternalInitialize(bytes, nullptr); } charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); @@ -1617,27 +1529,26 @@ class UTF8Encoding // Shouldn't have anything in fallback buffer for GetCharCount // (don't have to check m_throwOnOverflow for count) - Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, - "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end"); + ContractAssertFreeFallback(fallback == nullptr || fallback->GetRemaining() == 0) - InternalDelete(fallback); + free(fallback); return charCount; } - int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount) + int GetChars(unsigned char* bytes, int byteCount, char16_t* chars, int charCount) { - Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr"); - Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0"); - Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0"); - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr"); + ContractAssert(chars != nullptr) + ContractAssert(byteCount >= 0) + ContractAssert(charCount >= 0) + ContractAssert(bytes != nullptr) - BYTE *pSrc = bytes; - WCHAR *pTarget = chars; + unsigned char *pSrc = bytes; + char16_t *pTarget = chars; - BYTE *pEnd = pSrc + byteCount; - WCHAR 
*pAllocatedBufferEnd = pTarget + charCount; + unsigned char *pEnd = pSrc + byteCount; + char16_t *pAllocatedBufferEnd = pTarget + charCount; int ch = 0; @@ -1675,8 +1586,7 @@ class UTF8Encoding if ((ch & FinalByte) == 0) { // Not at last byte yet - Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); + ContractAssertFreeFallback((ch & (SupplimentarySeq | ThreeByteSeq)) != 0) if ((ch & SupplimentarySeq) != 0) { // Its a 4-byte supplimentary sequence @@ -1711,8 +1621,8 @@ class UTF8Encoding if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) { // let the range check for the second char throw the exception if (pTarget < pAllocatedBufferEnd) { - *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + - (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)))); + *pTarget = (char16_t)(((ch >> 10) & 0x7FF) + + (short)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)))); pTarget++; ch = (ch & 0x3FF) + @@ -1728,6 +1638,7 @@ class UTF8Encoding if (fallback == nullptr) { fallback = decoderFallback->CreateFallbackBuffer(); + RETURN_ON_ERROR fallback->InternalInitialize(bytes, pAllocatedBufferEnd); } @@ -1736,15 +1647,18 @@ class UTF8Encoding { // Ran out of buffer space // Need to throw an exception? 
- Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback"); + ContractAssertFreeFallback(pSrc >= bytes || pTarget == chars) fallback->InternalReset(); - ThrowCharsOverflow(pTarget == chars); + if (pTarget == chars) + { + errno = ERROR_INSUFFICIENT_BUFFER; + if (fallback) free(fallback); + return 0; + } ch = 0; break; } - Contract::Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array"); + ContractAssert(pSrc >= bytes) ch = 0; continue; @@ -1829,9 +1743,13 @@ class UTF8Encoding // Throw that we don't have enough room (pSrc could be < chars if we had started to process // a 4 byte sequence already) - Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]"); - ThrowCharsOverflow(pTarget == chars); + ContractAssert(pSrc >= bytes || pTarget == chars) + if (pTarget == chars) + { + errno = ERROR_INSUFFICIENT_BUFFER; + if (fallback) free(fallback); + return 0; + } // Don't store ch in decoder, we already backed up to its start ch = 0; @@ -1839,7 +1757,7 @@ class UTF8Encoding // Didn't throw, just use this buffer size. 
break; } - *pTarget = (WCHAR)ch; + *pTarget = (char16_t)ch; pTarget++; #ifdef FASTLOOP @@ -1857,7 +1775,7 @@ class UTF8Encoding } // try to get over the remainder of the ascii characters fast though - BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered while (pSrc < pLocalEnd) { ch = *pSrc; pSrc++; @@ -1865,7 +1783,7 @@ class UTF8Encoding if (ch > 0x7F) goto ProcessChar; - *pTarget = (WCHAR)ch; + *pTarget = (char16_t)ch; pTarget++; } // we are done @@ -1882,7 +1800,7 @@ class UTF8Encoding // To compute the upper bound, assume that all characters are ASCII characters at this point, // the boundary will be decreased for every non-ASCII character we encounter // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - WCHAR *pStop = pTarget + availableBytes - 7; + char16_t *pStop = pTarget + availableBytes - 7; while (pTarget < pStop) { ch = *pSrc; @@ -1891,7 +1809,7 @@ class UTF8Encoding if (ch > 0x7F) { goto LongCode; } - *pTarget = (WCHAR)ch; + *pTarget = (char16_t)ch; pTarget++; // get pSrc to be 2-byte aligned @@ -1901,29 +1819,35 @@ class UTF8Encoding if (ch > 0x7F) { goto LongCode; } - *pTarget = (WCHAR)ch; + *pTarget = (char16_t)ch; pTarget++; } // get pSrc to be 4-byte aligned if ((((size_t)pSrc) & 0x2) != 0) { - ch = *(USHORT*)pSrc; + ch = *(unsigned short*)pSrc; if ((ch & 0x8080) != 0) { goto LongCodeWithMask16; } // Unfortunately, this is endianness sensitive #if BIGENDIAN - *pTarget = (WCHAR)((ch >> 8) & 0x7F); - pSrc += 2; - *(pTarget + 1) = (WCHAR)(ch & 0x7F); - pTarget += 2; -#else // BIGENDIAN - *pTarget = (WCHAR)(ch & 0x7F); - pSrc += 2; - *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); - pTarget += 2; -#endif // BIGENDIAN + if (!treatAsLE) + { + *pTarget = (char16_t)((ch >> 8) & 0x7F); + pSrc += 2; + *(pTarget + 1) = (char16_t)(ch & 0x7F); + pTarget += 2; + } + else +#else + { + *pTarget = (char16_t)(ch & 0x7F); + pSrc += 
2; + *(pTarget + 1) = (char16_t)((ch >> 8) & 0x7F); + pTarget += 2; + } +#endif } // Run 8 characters at a time! @@ -1936,45 +1860,57 @@ class UTF8Encoding // Unfortunately, this is endianness sensitive #if BIGENDIAN - *pTarget = (WCHAR)((ch >> 24) & 0x7F); - *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F); - *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F); - *(pTarget + 3) = (WCHAR)(ch & 0x7F); - pSrc += 8; - *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F); - *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F); - *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F); - *(pTarget + 7) = (WCHAR)(chb & 0x7F); - pTarget += 8; -#else // BIGENDIAN - *pTarget = (WCHAR)(ch & 0x7F); - *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); - *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F); - *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F); - pSrc += 8; - *(pTarget + 4) = (WCHAR)(chb & 0x7F); - *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F); - *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F); - *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F); - pTarget += 8; -#endif // BIGENDIAN + if (!treatAsLE) + { + *pTarget = (char16_t)((ch >> 24) & 0x7F); + *(pTarget + 1) = (char16_t)((ch >> 16) & 0x7F); + *(pTarget + 2) = (char16_t)((ch >> 8) & 0x7F); + *(pTarget + 3) = (char16_t)(ch & 0x7F); + pSrc += 8; + *(pTarget + 4) = (char16_t)((chb >> 24) & 0x7F); + *(pTarget + 5) = (char16_t)((chb >> 16) & 0x7F); + *(pTarget + 6) = (char16_t)((chb >> 8) & 0x7F); + *(pTarget + 7) = (char16_t)(chb & 0x7F); + pTarget += 8; + } + else +#else + { + *pTarget = (char16_t)(ch & 0x7F); + *(pTarget + 1) = (char16_t)((ch >> 8) & 0x7F); + *(pTarget + 2) = (char16_t)((ch >> 16) & 0x7F); + *(pTarget + 3) = (char16_t)((ch >> 24) & 0x7F); + pSrc += 8; + *(pTarget + 4) = (char16_t)(chb & 0x7F); + *(pTarget + 5) = (char16_t)((chb >> 8) & 0x7F); + *(pTarget + 6) = (char16_t)((chb >> 16) & 0x7F); + *(pTarget + 7) = (char16_t)((chb >> 24) & 0x7F); + pTarget += 8; + } +#endif } break; -#if BIGENDIAN LongCodeWithMask32 : - // be careful about the sign extension - ch = 
(int)(((uint)ch) >> 16); +#if BIGENDIAN + // be careful about the sign extension + if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 16); + else +#else + ch &= 0xFF; +#endif + LongCodeWithMask16: - ch = (int)(((uint)ch) >> 8); -#else // BIGENDIAN - LongCodeWithMask32: - LongCodeWithMask16: +#if BIGENDIAN + if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 8); + else +#else ch &= 0xFF; -#endif // BIGENDIAN +#endif + pSrc++; if (ch <= 0x7F) { - *pTarget = (WCHAR)ch; + *pTarget = (char16_t)ch; pTarget++; continue; } @@ -2024,12 +1960,12 @@ class UTF8Encoding ch = (chc << 6) | (ch & 0x3F); - *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + - (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))); + *pTarget = (char16_t)(((ch >> 10) & 0x7FF) + + (short)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))); pTarget++; ch = (ch & 0x3FF) + - (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START); + (short)(CharUnicodeInfo::LOW_SURROGATE_START); // extra byte, we're already planning 2 chars for 2 of these bytes, // but the big loop is testing the target against pStop, so we need @@ -2073,7 +2009,7 @@ class UTF8Encoding ch = (ch << 6) | chc; } - *pTarget = (WCHAR)ch; + *pTarget = (char16_t)ch; pTarget++; // extra byte, we're only expecting 1 char for each of these 2 bytes, @@ -2083,7 +2019,7 @@ class UTF8Encoding } #endif // FASTLOOP - Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd"); + ContractAssert(pTarget <= pAllocatedBufferEnd) // no pending bits at this point ch = 0; @@ -2101,50 +2037,53 @@ class UTF8Encoding if (fallback == nullptr) { fallback = decoderFallback->CreateFallbackBuffer(); + RETURN_ON_ERROR fallback->InternalInitialize(bytes, pAllocatedBufferEnd); } // This'll back us up the appropriate # of bytes if we didn't get anywhere if (!FallbackInvalidByteSequence(pSrc, ch, fallback)) { - Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer 
while flushing"); + ContractAssertFreeFallback(pSrc >= bytes || pTarget == chars) // Ran out of buffer space // Need to throw an exception? fallback->InternalReset(); - ThrowCharsOverflow(pTarget == chars); + if (pTarget == chars) + { + errno = ERROR_INSUFFICIENT_BUFFER; + if (fallback) free(fallback); + return 0; + } } - Contract::Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array"); + ContractAssertFreeFallback(pSrc >= bytes) ch = 0; } // Shouldn't have anything in fallback buffer for GetChars // (don't have to check m_throwOnOverflow for chars) - Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, - "[UTF8Encoding.GetChars]Expected empty fallback buffer at end"); + ContractAssert(fallback == nullptr || fallback->GetRemaining() == 0) - InternalDelete(fallback); + free(fallback); return PtrDiff(pTarget, chars); } - int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount) + int GetBytes(char16_t* chars, int charCount, unsigned char* bytes, int byteCount) { - Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr"); - Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0"); - Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0"); - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr"); + ContractAssert(chars != nullptr) + ContractAssert(byteCount >= 0) + ContractAssert(charCount >= 0) + ContractAssert(bytes != nullptr) // For fallback we may need a fallback buffer. 
// We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer* fallbackBuffer = nullptr; - WCHAR *pSrc = chars; - BYTE *pTarget = bytes; + EncoderFallbackBuffer* fallback = nullptr; + char16_t *pSrc = chars; + unsigned char *pTarget = bytes; - WCHAR *pEnd = pSrc + charCount; - BYTE *pAllocatedBufferEnd = pTarget + byteCount; + char16_t *pEnd = pSrc + charCount; + unsigned char *pAllocatedBufferEnd = pTarget + byteCount; int ch = 0; @@ -2157,20 +2096,19 @@ class UTF8Encoding if (ch == 0) { // Check if there's anything left to get out of the fallback buffer - ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; + ch = fallback != nullptr ? fallback->InternalGetNextChar() : 0; if (ch > 0) { goto ProcessChar; } } else { // Case of leftover surrogates in the fallback buffer - if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate"); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); + if (fallback != nullptr && fallback->bFallingBack) { + ContractAssertFreeFallback(ch >= 0xD800 && ch <= 0xDBFF); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)) int cha = ch; - ch = fallbackBuffer->InternalGetNextChar(); + ch = fallback->InternalGetNextChar(); if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); @@ -2195,8 +2133,7 @@ class UTF8Encoding if (ch > 0) { // We have a high surrogate left over from a previous loop. 
- Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate");//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); + ContractAssertFreeFallback(ch >= 0xD800 && ch <= 0xDBFF);//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)) // use separate helper variables for local contexts so that the jit optimizations // won't get confused about the variable lifetimes @@ -2219,9 +2156,9 @@ class UTF8Encoding } // If we've used a fallback, then we have to check for it - if (fallbackBuffer != nullptr) + if (fallback != nullptr) { - ch = fallbackBuffer->InternalGetNextChar(); + ch = fallback->InternalGetNextChar(); if (ch > 0) goto ProcessChar; } @@ -2242,19 +2179,21 @@ class UTF8Encoding { // Lone surrogates aren't allowed, we have to do fallback for them // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == nullptr) + if (fallback == nullptr) { // wait on fallbacks if we can // For fallback we may need a fallback buffer - fallbackBuffer = encoderFallback->CreateFallbackBuffer(); + fallback = encoderFallback->CreateFallbackBuffer(); + RETURN_ON_ERROR // Set our internal fallback interesting things. - fallbackBuffer->InternalInitialize(chars, pEnd, true); + fallback->InternalInitialize(chars, pEnd, true); } // Do our fallback. Actually we already know its a mixed up surrogate, // so the ref pSrc isn't gonna do anything. 
- fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); + fallback->InternalFallback((char16_t)ch, &pSrc); + RETURN_ON_ERROR // Ignore it if we don't throw ch = 0; @@ -2275,11 +2214,11 @@ class UTF8Encoding if (pTarget > pAllocatedBufferEnd - bytesNeeded) { // Left over surrogate from last time will cause pSrc == chars, so we'll throw - if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) + if (fallback != nullptr && fallback->bFallingBack) { - fallbackBuffer->MovePrevious(); // Didn't use this fallback char + fallback->MovePrevious(); // Didn't use this fallback char if (ch > 0xFFFF) - fallbackBuffer->MovePrevious(); // Was surrogate, didn't use 2nd part either + fallback->MovePrevious(); // Was surrogate, didn't use 2nd part either } else { @@ -2287,52 +2226,56 @@ class UTF8Encoding if (ch > 0xFFFF) pSrc--; // Was surrogate, didn't use 2nd part either } - Contract::Assert(pSrc >= chars || pTarget == bytes, - "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room."); - ThrowBytesOverflow(pTarget == bytes); // Throw if we must + ContractAssertFreeFallback(pSrc >= chars || pTarget == bytes) + if (pTarget == bytes) // Throw if we must + { + errno = ERROR_INSUFFICIENT_BUFFER; + if (fallback) free(fallback); + return 0; + } ch = 0; // Nothing left over (we backed up to start of pair if supplimentary) break; } if (ch <= 0x7F) { - *pTarget = (BYTE)ch; + *pTarget = (unsigned char)ch; } else { // use separate helper variables for local contexts so that the jit optimizations // won't get confused about the variable lifetimes int chb; if (ch <= 0x7FF) { - // 2 BYTE encoding - chb = (BYTE)(0xC0 | (ch >> 6)); + // 2 unsigned char encoding + chb = (unsigned char)(0xC0 | (ch >> 6)); } else { if (ch <= 0xFFFF) { - chb = (BYTE)(0xE0 | (ch >> 12)); + chb = (unsigned char)(0xE0 | (ch >> 12)); } else { - *pTarget = (BYTE)(0xF0 | (ch >> 18)); + *pTarget = (unsigned char)(0xF0 | (ch >> 18)); pTarget++; chb = 0x80 | ((ch >> 12) & 0x3F); } - 
*pTarget = (BYTE)chb; + *pTarget = (unsigned char)chb; pTarget++; chb = 0x80 | ((ch >> 6) & 0x3F); } - *pTarget = (BYTE)chb; + *pTarget = (unsigned char)chb; pTarget++; - *pTarget = (BYTE)0x80 | (ch & 0x3F); + *pTarget = (unsigned char)0x80 | (ch & 0x3F); } pTarget++; #ifdef FASTLOOP // If still have fallback don't do fast loop - if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0) + if (fallback != nullptr && (ch = fallback->InternalGetNextChar()) != 0) goto ProcessChar; int availableChars = PtrDiff(pEnd, pSrc); @@ -2341,7 +2284,7 @@ class UTF8Encoding // don't fall into the fast decoding loop if we don't have enough characters // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. if (availableChars <= 13) { - // we are hoping for 1 BYTE per char + // we are hoping for 1 unsigned char per char if (availableBytes < availableChars) { // not enough output room. no pending bits at this point ch = 0; @@ -2349,16 +2292,16 @@ class UTF8Encoding } // try to get over the remainder of the ascii characters fast though - WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + char16_t* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered while (pSrc < pLocalEnd) { ch = *pSrc; pSrc++; - // Not ASCII, need more than 1 BYTE per char + // Not ASCII, need more than 1 unsigned char per char if (ch > 0x7F) goto ProcessChar; - *pTarget = (BYTE)ch; + *pTarget = (unsigned char)ch; pTarget++; } // we are done, let ch be 0 to clear encoder @@ -2366,7 +2309,7 @@ class UTF8Encoding break; } - // we need at least 1 BYTE per character, but Convert might allow us to convert + // we need at least 1 unsigned char per character, but Convert might allow us to convert // only part of the input, so try as much as we can. 
Reduce charCount if necessary if (availableBytes < availableChars) { @@ -2381,7 +2324,7 @@ class UTF8Encoding // the boundary will be decreased for every non-ASCII character we encounter // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. - WCHAR *pStop = pSrc + availableChars - 5; + char16_t *pStop = pSrc + availableChars - 5; while (pSrc < pStop) { ch = *pSrc; @@ -2390,7 +2333,7 @@ class UTF8Encoding if (ch > 0x7F) { goto LongCode; } - *pTarget = (BYTE)ch; + *pTarget = (unsigned char)ch; pTarget++; // get pSrc aligned @@ -2400,7 +2343,7 @@ class UTF8Encoding if (ch > 0x7F) { goto LongCode; } - *pTarget = (BYTE)ch; + *pTarget = (unsigned char)ch; pTarget++; } @@ -2414,55 +2357,63 @@ class UTF8Encoding // Unfortunately, this is endianness sensitive #if BIGENDIAN - *pTarget = (BYTE)(ch >> 16); - *(pTarget + 1) = (BYTE)ch; - pSrc += 4; - *(pTarget + 2) = (BYTE)(chc >> 16); - *(pTarget + 3) = (BYTE)chc; - pTarget += 4; -#else // BIGENDIAN - *pTarget = (BYTE)ch; - *(pTarget + 1) = (BYTE)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (BYTE)chc; - *(pTarget + 3) = (BYTE)(chc >> 16); - pTarget += 4; -#endif // BIGENDIAN + if (!treatAsLE) + { + *pTarget = (unsigned char)(ch >> 16); + *(pTarget + 1) = (unsigned char)ch; + pSrc += 4; + *(pTarget + 2) = (unsigned char)(chc >> 16); + *(pTarget + 3) = (unsigned char)chc; + pTarget += 4; + } + else +#else + { + *pTarget = (unsigned char)ch; + *(pTarget + 1) = (unsigned char)(ch >> 16); + pSrc += 4; + *(pTarget + 2) = (unsigned char)chc; + *(pTarget + 3) = (unsigned char)(chc >> 16); + pTarget += 4; + } +#endif } continue; LongCodeWithMask: #if BIGENDIAN - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); -#else // BIGENDIAN - ch = (WCHAR)ch; -#endif // BIGENDIAN - pSrc++; + // be careful about the sign extension + if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 
16); + else +#else + ch = (char16_t)ch; +#endif - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (BYTE)ch; - pTarget++; - continue; + pSrc++; + + if (ch > 0x7F) { + goto LongCode; + } + *pTarget = (unsigned char)ch; + pTarget++; + continue; LongCode: // use separate helper variables for slow and fast loop so that the jit optimizations // won't get confused about the variable lifetimes int chd; if (ch <= 0x7FF) { - // 2 BYTE encoding + // 2 unsigned char encoding chd = 0xC0 | (ch >> 6); } else { if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // 3 BYTE encoding + // 3 unsigned char encoding chd = 0xE0 | (ch >> 12); } else { - // 4 BYTE encoding - high surrogate + low surrogate + // 4 unsigned char encoding - high surrogate + low surrogate if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) { // low without high -> bad, try again in slow loop pSrc -= 1; @@ -2484,30 +2435,30 @@ class UTF8Encoding - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - *pTarget = (BYTE)(0xF0 | (ch >> 18)); - // pStop - this BYTE is compensated by the second surrogate character + *pTarget = (unsigned char)(0xF0 | (ch >> 18)); + // pStop - this unsigned char is compensated by the second surrogate character // 2 input chars require 4 output bytes. 2 have been anticipated already // and 2 more will be accounted for by the 2 pStop-- calls below. pTarget++; chd = 0x80 | ((ch >> 12) & 0x3F); } - *pTarget = (BYTE)chd; - pStop--; // 3 BYTE sequence for 1 char, so need pStop-- and the one below too. + *pTarget = (unsigned char)chd; + pStop--; // 3 unsigned char sequence for 1 char, so need pStop-- and the one below too. pTarget++; chd = 0x80 | ((ch >> 6) & 0x3F); } - *pTarget = (BYTE)chd; - pStop--; // 2 BYTE sequence for 1 char so need pStop--. + *pTarget = (unsigned char)chd; + pStop--; // 2 unsigned char sequence for 1 char so need pStop--. 
pTarget++; - *pTarget = (BYTE)(0x80 | (ch & 0x3F)); - // pStop - this BYTE is already included + *pTarget = (unsigned char)(0x80 | (ch & 0x3F)); + // pStop - this unsigned char is already included pTarget++; } - Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd"); + ContractAssertFreeFallback(pTarget <= pAllocatedBufferEnd) #endif // FASTLOOP @@ -2515,18 +2466,18 @@ class UTF8Encoding ch = 0; } - InternalDelete(fallbackBuffer); + free(fallback); return (int)(pTarget - bytes); } - int GetByteCount(WCHAR *chars, int count) + int GetByteCount(char16_t *chars, int count) { // For fallback we may need a fallback buffer. // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer* fallbackBuffer = nullptr; - WCHAR *pSrc = chars; - WCHAR *pEnd = pSrc + count; + EncoderFallbackBuffer* fallback = nullptr; + char16_t *pSrc = chars; + char16_t *pEnd = pSrc + count; // Start by assuming we have as many as count int byteCount = count; @@ -2539,7 +2490,7 @@ class UTF8Encoding if (ch == 0) { // Unroll any fallback that happens at the end - ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; + ch = fallback != nullptr ? fallback->InternalGetNextChar() : 0; if (ch > 0) { byteCount++; goto ProcessChar; @@ -2547,11 +2498,10 @@ class UTF8Encoding } else { // Case of surrogates in the fallback. 
- if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate");// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); + if (fallback != nullptr && fallback->bFallingBack) { + ContractAssertFreeFallback(ch >= 0xD800 && ch <= 0xDBFF);// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)) - ch = fallbackBuffer->InternalGetNextChar(); + ch = fallback->InternalGetNextChar(); byteCount++; if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { @@ -2579,8 +2529,7 @@ class UTF8Encoding } if (ch > 0) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate"); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); + ContractAssertFreeFallback(ch >= 0xD800 && ch <= 0xDBFF); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)) // use separate helper variables for local contexts so that the jit optimizations // won't get confused about the variable lifetimes @@ -2609,9 +2558,9 @@ class UTF8Encoding } // If we've used a fallback, then we have to check for it - if (fallbackBuffer != nullptr) + if (fallback != nullptr) { - ch = fallbackBuffer->InternalGetNextChar(); + ch = fallback->InternalGetNextChar(); if (ch > 0) { // We have an extra byte we weren't expecting. @@ -2639,19 +2588,21 @@ class UTF8Encoding { // Lone surrogates aren't allowed // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == nullptr) + if (fallback == nullptr) { // wait on fallbacks if we can // For fallback we may need a fallback buffer - fallbackBuffer = encoderFallback->CreateFallbackBuffer(); + fallback = encoderFallback->CreateFallbackBuffer(); + RETURN_ON_ERROR // Set our internal fallback interesting things. 
- fallbackBuffer->InternalInitialize(chars, chars + count, false); + fallback->InternalInitialize(chars, chars + count, false); } // Do our fallback. Actually we already know its a mixed up surrogate, // so the ref pSrc isn't gonna do anything. - fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); + fallback->InternalFallback((char16_t)ch, &pSrc); + RETURN_ON_ERROR // Ignore it if we don't throw (we had preallocated this ch) byteCount--; @@ -2678,7 +2629,7 @@ class UTF8Encoding #ifdef FASTLOOP // If still have fallback don't do fast loop - if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0) + if (fallback != nullptr && (ch = fallback->InternalGetNextChar()) != 0) { // We're reserving 1 byte for each char by default byteCount++; @@ -2690,7 +2641,7 @@ class UTF8Encoding // don't fall into the fast decoding loop if we don't have enough characters if (availableChars <= 13) { // try to get over the remainder of the ascii characters fast though - WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + char16_t* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered while (pSrc < pLocalEnd) { ch = *pSrc; pSrc++; @@ -2711,7 +2662,7 @@ class UTF8Encoding // To compute the upper bound, assume that all characters are ASCII characters at this point, // the boundary will be decreased for every non-ASCII character we encounter // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - WCHAR *pStop = pSrc + availableChars - (3 + 4); + char16_t *pStop = pSrc + availableChars - (3 + 4); while (pSrc < pStop) { ch = *pSrc; @@ -2791,16 +2742,18 @@ class UTF8Encoding LongCodeWithMask: #if BIGENDIAN - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); -#else // BIGENDIAN - ch = (WCHAR)ch; -#endif // BIGENDIAN - pSrc++; + // be careful about the sign extension + if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 16); + else +#else + ch = (char16_t)ch; +#endif - if (ch <= 0x7F) { - 
continue; - } + pSrc++; + + if (ch <= 0x7F) { + continue; + } LongCode: // use separate helper variables for slow and fast loop so that the jit optimizations @@ -2836,102 +2789,111 @@ class UTF8Encoding #if WIN64 // check for overflow - if (byteCount < 0) { - throw ArgumentException("Conversion buffer overflow."); - } + ContractAssertFreeFallback(byteCount >= 0) #endif + ContractAssertFreeFallback(fallback == nullptr || fallback->GetRemaining() == 0) - Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0, - "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer"); - - InternalDelete(fallbackBuffer); + free(fallback); return byteCount; } - }; - -//////////////////////////////////////////////////////////////////////////// -// -// UTF8ToUnicode -// -// Maps a UTF-8 character string to its wide character string counterpart. -// -//////////////////////////////////////////////////////////////////////////// - -int UTF8ToUnicode( - LPCSTR lpSrcStr, +int minipal_utf8_to_utf16_preallocated( + const char* lpSrcStr, int cchSrc, - LPWSTR lpDestStr, + char16_t** lpDestStr, int cchDest, - DWORD dwFlags - ) + unsigned int dwFlags, + bool treatAsLE) { int ret; - UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS); - try { - ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc); - if (cchDest){ - if (ret > cchDest){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - ret = 0; - } - enc.GetChars((BYTE*)lpSrcStr, cchSrc, (WCHAR*)lpDestStr, ret); + errno = 0; + + if (cchSrc < 0) + cchSrc = strlen(lpSrcStr) + 1; + + UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS, treatAsLE); + ret = enc.GetCharCount((unsigned char*)lpSrcStr, cchSrc); + if (cchDest) + { + if (ret > cchDest) + { + errno = ERROR_INSUFFICIENT_BUFFER; + ret = 0; } - } - catch (const InsufficientBufferException& e){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - return 0; - } - catch (const DecoderFallbackException& e){ - SetLastError(ERROR_NO_UNICODE_TRANSLATION); - return 0; - } - catch (const 
ArgumentException& e){ - SetLastError(ERROR_INVALID_PARAMETER); - return 0; + enc.GetChars((unsigned char*)lpSrcStr, cchSrc, (char16_t*)*lpDestStr, ret); + if (errno) ret = 0; } return ret; } -//////////////////////////////////////////////////////////////////////////// -// -// UnicodeToUTF8 -// -// Maps a Unicode character string to its UTF-8 string counterpart. -// -//////////////////////////////////////////////////////////////////////////// - -int UnicodeToUTF8( - LPCWSTR lpSrcStr, +static int utf16_to_utf8_preallocated( + const char16_t* lpSrcStr, int cchSrc, - LPSTR lpDestStr, - int cchDest) + char** lpDestStr, + int cchDest, + bool treatAsLE) { int ret; - UTF8Encoding enc(false); - try{ - ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc); - if (cchDest){ - if (ret > cchDest){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - ret = 0; - } - enc.GetBytes((WCHAR*)lpSrcStr, cchSrc, (BYTE*)lpDestStr, ret); + errno = 0; + + if (cchSrc < 0) + cchSrc = wcslen(lpSrcStr) + 1; + + UTF8Encoding enc(false, treatAsLE); + ret = enc.GetByteCount((char16_t*)lpSrcStr, cchSrc); + if (cchDest) + { + if (ret > cchDest) + { + errno = ERROR_INSUFFICIENT_BUFFER; + ret = 0; } + enc.GetBytes((char16_t*)lpSrcStr, cchSrc, (unsigned char*)*lpDestStr, ret); + if (errno) ret = 0; } - catch (const InsufficientBufferException& e){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - return 0; - } - catch (const EncoderFallbackException& e){ - SetLastError(ERROR_NO_UNICODE_TRANSLATION); - return 0; + return ret; +} + +int minipal_utf16_to_utf8_preallocated( + const char16_t* lpSrcStr, + int cchSrc, + char** lpDestStr, + int cchDest) +{ + return utf16_to_utf8_preallocated(lpSrcStr, cchSrc, lpDestStr, cchDest, false); +} + +int minipal_utf8_to_utf16_allocate( + const char* lpSrcStr, + int cchSrc, + char16_t** lpDestStr, + unsigned int dwFlags, + bool treatAsLE) +{ + int cchDest = minipal_utf8_to_utf16_preallocated(lpSrcStr, cchSrc, nullptr, 0, dwFlags, !treatAsLE); + if (cchDest > 0) + { + *lpDestStr = 
(char16_t*)malloc((cchDest + 1) * sizeof(char16_t)); + cchDest = minipal_utf8_to_utf16_preallocated(lpSrcStr, cchSrc, lpDestStr, cchDest, dwFlags, !treatAsLE); + (*lpDestStr)[cchDest] = '\0'; } - catch (const ArgumentException& e){ - SetLastError(ERROR_INVALID_PARAMETER); - return 0; + return cchDest; +} + +int minipal_utf16_to_utf8_allocate( + const char16_t* lpSrcStr, + int cchSrc, + char** lpDestStr, + bool treatAsLE) +{ + int cchDest = utf16_to_utf8_preallocated(lpSrcStr, cchSrc, nullptr, 0, treatAsLE); + if (cchDest > 0) + { + *lpDestStr = (char*)malloc((cchDest + 1) * sizeof(char)); + cchDest = utf16_to_utf8_preallocated(lpSrcStr, cchSrc, lpDestStr, cchDest, treatAsLE); + (*lpDestStr)[cchDest] = '\0'; } - return ret; + return cchDest; } diff --git a/src/native/minipal/utf8.h b/src/native/minipal/utf8.h new file mode 100644 index 00000000000000..71b9a805aa11b5 --- /dev/null +++ b/src/native/minipal/utf8.h @@ -0,0 +1,33 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +#ifndef HAVE_MINIPAL_UTF8_H +#define HAVE_MINIPAL_UTF8_H + +#include +#include +#include + +#define MB_ERR_INVALID_CHARS 0x00000008 +#define ERROR_NO_UNICODE_TRANSLATION 1113L +#define ERROR_INSUFFICIENT_BUFFER 122L +#define ERROR_INVALID_PARAMETER 87L + +#ifdef __cplusplus +extern "C" +{ +#endif // __cplusplus + +int minipal_utf8_to_utf16_preallocated(const char* lpSrcStr, int cchSrc, char16_t** lpDestStr, int cchDest, unsigned int dwFlags, bool treatAsLE); + +int minipal_utf16_to_utf8_preallocated(const char16_t* lpSrcStr, int cchSrc, char** lpDestStr, int cchDest); + +int minipal_utf8_to_utf16_allocate(const char* lpSrcStr, int cchSrc, char16_t** lpDestStr, unsigned int dwFlags, bool treatAsLE); + +int minipal_utf16_to_utf8_allocate(const char16_t* lpSrcStr, int cchSrc, char** lpDestStr, bool treatAsLE); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif /* HAVE_MINIPAL_UTF8_H */ From 0bd1d9cd6067b18666147346e03e5258613b372b Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Fri, 26 May 2023 06:10:06 +0300 Subject: [PATCH 4/9] Remove C++ runtime dependency --- src/mono/CMakeLists.txt | 8 +++++++ src/mono/mono/eglib/CMakeLists.txt | 2 +- src/mono/mono/mini/CMakeLists.txt | 5 ++++- src/mono/mono/profiler/CMakeLists.txt | 2 +- src/native/minipal/utf8.cpp | 30 ++++++++++++++------------- 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/src/mono/CMakeLists.txt b/src/mono/CMakeLists.txt index bc65886aea2f36..05cd80948408ea 100644 --- a/src/mono/CMakeLists.txt +++ b/src/mono/CMakeLists.txt @@ -589,6 +589,14 @@ if(GCC) if(ENABLE_WERROR) append("-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() + + # don't link with C++ runtime lib for targets which don't require it (tvOS and iOS need it for ICU) + if(NOT LLVM_PREFIX AND NOT HOST_TVOS AND NOT HOST_IOS) + append("-nodefaultlibs" CMAKE_CXX_FLAGS) + if(NOT HOST_WASM) + append("-lc" CMAKE_CXX_FLAGS) + endif() + endif() endif() ###################################### diff 
--git a/src/mono/mono/eglib/CMakeLists.txt b/src/mono/mono/eglib/CMakeLists.txt index a65a4e64085e92..9946145859e030 100644 --- a/src/mono/mono/eglib/CMakeLists.txt +++ b/src/mono/mono/eglib/CMakeLists.txt @@ -39,7 +39,7 @@ set(eglib_common_sources if(HOST_WIN32) set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4267 /wd4458 /wd4310") else() -set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp" PROPERTIES COMPILE_FLAGS "-std=c++11 -nostdlib -fno-rtti -fno-exceptions") +set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp" PROPERTIES COMPILE_FLAGS "-std=c++11 -fno-rtti -fno-exceptions") endif() set(eglib_headers diff --git a/src/mono/mono/mini/CMakeLists.txt b/src/mono/mono/mini/CMakeLists.txt index 8e60babe7bb83c..9df1cb56f14fa2 100644 --- a/src/mono/mono/mini/CMakeLists.txt +++ b/src/mono/mono/mini/CMakeLists.txt @@ -24,7 +24,10 @@ set(OS_LIBS "-framework CoreFoundation" "-lobjc" "-lc++") elseif(HOST_ANDROID) set(OS_LIBS m dl log) elseif(HOST_LINUX) -set(OS_LIBS pthread m dl) +set(OS_LIBS pthread m dl gcc_s) + if(NOT CLR_CMAKE_HOST_LINUX_MUSL) # glibc build also requires libc_nonshared.a for atexit(3) usage + set(OS_LIBS ${OS_LIBS} -l:libc_nonshared.a) + endif() elseif(HOST_WIN32) set(OS_LIBS bcrypt.lib Mswsock.lib ws2_32.lib psapi.lib version.lib advapi32.lib winmm.lib kernel32.lib) elseif(HOST_SOLARIS) diff --git a/src/mono/mono/profiler/CMakeLists.txt b/src/mono/mono/profiler/CMakeLists.txt index 6bca00983686fc..f826371103b6f2 100644 --- a/src/mono/mono/profiler/CMakeLists.txt +++ b/src/mono/mono/profiler/CMakeLists.txt @@ -14,7 +14,7 @@ if(NOT DISABLE_LIBS) # Build the logging profiler only for certain platforms add_library(mono-profiler-log SHARED helper.c log.c log-args.c ${ZLIB_SOURCES}) target_compile_definitions(mono-profiler-log PRIVATE -DMONO_DLL_EXPORT) - target_link_libraries(mono-profiler-log PRIVATE monosgen-shared monoapi eglib_objects) + 
target_link_libraries(mono-profiler-log PRIVATE monosgen-shared monoapi eglib_objects ${CMAKE_DL_LIBS}) if(HOST_ANDROID) target_link_libraries(mono-profiler-log PRIVATE log) endif() diff --git a/src/native/minipal/utf8.cpp b/src/native/minipal/utf8.cpp index ce967669c46d3d..875b4eeb6008ca 100644 --- a/src/native/minipal/utf8.cpp +++ b/src/native/minipal/utf8.cpp @@ -19,7 +19,7 @@ Revision History: #include #include #include -#include +#include #define FASTLOOP @@ -29,6 +29,8 @@ Revision History: #define W(str) u##str #endif +inline void *operator new(size_t, void *p) throw () { return p; } + struct CharUnicodeInfo { static const char16_t HIGH_SURROGATE_START = 0xd800; @@ -188,11 +190,11 @@ class DecoderFallback // // Return the appropriate unicode string alternative to the character that need to fall back. - virtual DecoderFallbackBuffer* CreateFallbackBuffer() = 0; + virtual DecoderFallbackBuffer* CreateFallbackBuffer() { assert(!"pure virtual function called"); while(true); } // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() = 0; + virtual int GetMaxCharCount() { assert(!"pure virtual function called"); while(true); } }; class DecoderReplacementFallback : public DecoderFallback @@ -275,16 +277,16 @@ class DecoderFallbackBuffer // These wrap the internal methods so that we can check for people doing stuff that's incorrect public: - virtual bool Fallback(unsigned char bytesUnknown[], int index, int size) = 0; + virtual bool Fallback(unsigned char bytesUnknown[], int index, int size) { assert(!"pure virtual function called"); while(true); } // Get next character - virtual char16_t GetNextChar() = 0; + virtual char16_t GetNextChar() { assert(!"pure virtual function called"); while(true); } //Back up a character - virtual bool MovePrevious() = 0; + virtual bool MovePrevious() { assert(!"pure virtual function called"); while(true); } // How many chars left in this fallback? 
- virtual int GetRemaining() = 0; + virtual int GetRemaining() { assert(!"pure virtual function called"); while(true); } // Clear the buffer virtual void Reset() @@ -606,10 +608,10 @@ class EncoderFallback // // Return the appropriate unicode string alternative to the character that need to fall back. - virtual EncoderFallbackBuffer* CreateFallbackBuffer() = 0; + virtual EncoderFallbackBuffer* CreateFallbackBuffer() { assert(!"pure virtual function called"); while(true); } // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() = 0; + virtual int GetMaxCharCount() { assert(!"pure virtual function called"); while(true); } }; class EncoderReplacementFallback : public EncoderFallback @@ -692,18 +694,18 @@ class EncoderFallbackBuffer // These wrap the internal methods so that we can check for people doing stuff that is incorrect public: - virtual bool Fallback(char16_t charUnknown, int index) = 0; + virtual bool Fallback(char16_t charUnknown, int index) { assert(!"pure virtual function called"); while(true); } - virtual bool Fallback(char16_t charUnknownHigh, char16_t charUnknownLow, int index) = 0; + virtual bool Fallback(char16_t charUnknownHigh, char16_t charUnknownLow, int index) { assert(!"pure virtual function called"); while(true); } // Get next character - virtual char16_t GetNextChar() = 0; + virtual char16_t GetNextChar() { assert(!"pure virtual function called"); while(true); } // Back up a character - virtual bool MovePrevious() = 0; + virtual bool MovePrevious() { assert(!"pure virtual function called"); while(true); } // How many chars left in this fallback? - virtual int GetRemaining() = 0; + virtual int GetRemaining() { assert(!"pure virtual function called"); while(true); } // Not sure if this should be public or not. 
// Clear the buffer From 5e1289496fcccb7f8f7ecab912765185f71dca90 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Sat, 17 Jun 2023 13:08:16 +0300 Subject: [PATCH 5/9] Initial C++ to C conversion --- src/coreclr/pal/src/CMakeLists.txt | 2 +- src/coreclr/pal/src/locale/unicode.cpp | 30 +- src/mono/CMakeLists.txt | 8 - src/mono/mono/eglib/CMakeLists.txt | 10 +- src/mono/mono/eglib/giconv.c | 98 +- src/mono/mono/metadata/object.c | 2 +- src/mono/mono/mini/CMakeLists.txt | 7 +- src/mono/mono/profiler/CMakeLists.txt | 2 +- src/native/minipal/utf8.c | 2140 +++++++++++++++++ src/native/minipal/utf8.cpp | 2901 ------------------------ src/native/minipal/utf8.h | 57 +- 11 files changed, 2299 insertions(+), 2958 deletions(-) create mode 100644 src/native/minipal/utf8.c delete mode 100644 src/native/minipal/utf8.cpp diff --git a/src/coreclr/pal/src/CMakeLists.txt b/src/coreclr/pal/src/CMakeLists.txt index d61ffc4cca5f11..cd23cadbe2d749 100644 --- a/src/coreclr/pal/src/CMakeLists.txt +++ b/src/coreclr/pal/src/CMakeLists.txt @@ -152,7 +152,7 @@ set(SOURCES loader/module.cpp locale/unicode.cpp locale/unicodedata.cpp - ${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp + ${CLR_SRC_NATIVE_DIR}/minipal/utf8.c map/common.cpp map/map.cpp map/virtual.cpp diff --git a/src/coreclr/pal/src/locale/unicode.cpp b/src/coreclr/pal/src/locale/unicode.cpp index d3f4da7a60b53f..b9a0291394dc9b 100644 --- a/src/coreclr/pal/src/locale/unicode.cpp +++ b/src/coreclr/pal/src/locale/unicode.cpp @@ -253,11 +253,22 @@ MultiByteToWideChar( goto EXIT; } - // Use minipal_utf8_to_utf16_preallocated on all systems, since it replaces + // Use minipal_convert_utf8_to_utf16 on all systems, since it replaces // invalid characters and Core Foundation doesn't do that. 
if (CodePage == CP_UTF8 || CodePage == CP_ACP) { - retval = minipal_utf8_to_utf16_preallocated(lpMultiByteStr, cbMultiByte, &lpWideCharStr, cchWideChar, dwFlags, /* treatAsLE */ false); + if (cbMultiByte < 0) + cbMultiByte = strlen(lpMultiByteStr) + 1; + + if (!lpWideCharStr || cchWideChar == 0) + retval = minipal_get_length_utf8_to_utf16(lpMultiByteStr, cbMultiByte, dwFlags); + + if (lpWideCharStr) + { + if (cchWideChar == 0) cchWideChar = retval; + retval = minipal_convert_utf8_to_utf16(lpMultiByteStr, cbMultiByte, (CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags); + } + goto EXIT; } @@ -333,11 +344,22 @@ WideCharToMultiByte( defaultChar = *lpDefaultChar; } - // Use minipal_utf16_to_utf8_preallocated on all systems because we use + // Use minipal_convert_utf16_to_utf8 on all systems because we use // UTF8ToUnicode in MultiByteToWideChar() on all systems. if (CodePage == CP_UTF8 || CodePage == CP_ACP) { - retval = minipal_utf16_to_utf8_preallocated(lpWideCharStr, cchWideChar, &lpMultiByteStr, cbMultiByte); + if (cchWideChar < 0) + cchWideChar = PAL_wcslen(lpWideCharStr) + 1; + + if (!lpMultiByteStr || cbMultiByte == 0) + retval = minipal_get_length_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags); + + if (lpMultiByteStr) + { + if (cbMultiByte == 0) cbMultiByte = retval; + retval = minipal_convert_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte, dwFlags); + } + goto EXIT; } diff --git a/src/mono/CMakeLists.txt b/src/mono/CMakeLists.txt index 05cd80948408ea..bc65886aea2f36 100644 --- a/src/mono/CMakeLists.txt +++ b/src/mono/CMakeLists.txt @@ -589,14 +589,6 @@ if(GCC) if(ENABLE_WERROR) append("-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) endif() - - # don't link with C++ runtime lib for targets which don't require it (tvOS and iOS need it for ICU) - if(NOT LLVM_PREFIX AND NOT HOST_TVOS AND NOT HOST_IOS) - append("-nodefaultlibs" CMAKE_CXX_FLAGS) - if(NOT HOST_WASM) - append("-lc" CMAKE_CXX_FLAGS) - endif() - endif() endif() 
###################################### diff --git a/src/mono/mono/eglib/CMakeLists.txt b/src/mono/mono/eglib/CMakeLists.txt index 9946145859e030..09cf32eaa81ada 100644 --- a/src/mono/mono/eglib/CMakeLists.txt +++ b/src/mono/mono/eglib/CMakeLists.txt @@ -34,12 +34,10 @@ set(eglib_common_sources gfile.c gfile-posix.c gutf8.c - ${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp) + ${CLR_SRC_NATIVE_DIR}/minipal/utf8.c) -if(HOST_WIN32) -set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4267 /wd4458 /wd4310") -else() -set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.cpp" PROPERTIES COMPILE_FLAGS "-std=c++11 -fno-rtti -fno-exceptions") +if(IS_BIG_ENDIAN) + set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.c" PROPERTIES COMPILE_FLAGS "-DBIGENDIAN=1") endif() set(eglib_headers @@ -48,7 +46,7 @@ set(eglib_headers gmodule.h) if(HAVE_CLOCK_NANOSLEEP) -list(APPEND eglib_common_sources gclock-nanosleep.c) + list(APPEND eglib_common_sources gclock-nanosleep.c) endif() set(eglib_sources "${eglib_platform_sources};${eglib_common_sources}") diff --git a/src/mono/mono/eglib/giconv.c b/src/mono/mono/eglib/giconv.c index 79c45c8182adea..7863d8cbd35cd6 100644 --- a/src/mono/mono/eglib/giconv.c +++ b/src/mono/mono/eglib/giconv.c @@ -28,7 +28,6 @@ #include #include "../utils/mono-errno.h" -typedef gunichar2 char16_t; #include #ifdef _MSC_VER @@ -44,9 +43,6 @@ static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *out #if G_BYTE_ORDER == G_LITTLE_ENDIAN #define decode_utf16 decode_utf16le #else -#ifndef BIGENDIAN -#define BIGENDIAN -#endif #define decode_utf16 decode_utf16be #endif @@ -328,27 +324,42 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) static FORCE_INLINE (void) map_error(GError **err) { - if (errno == 0) return; - if (errno == ERROR_INSUFFICIENT_BUFFER) { + if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) { g_set_error (err, G_CONVERT_ERROR, 
G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); - } else if (errno == ERROR_NO_UNICODE_TRANSLATION) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, - "Illegal byte sequence encountered in the input."); - } else { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, - "Partial byte sequence encountered in the input."); } } static gunichar2 * -g_utf8_to_utf16_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err, int dwFlags, bool treatAsLE) +g_utf8_to_utf16_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err, int flags, bool treatAsLE) { errno = 0; gunichar2* lpDestStr = NULL; - int ret = minipal_utf8_to_utf16_allocate (str, len, &lpDestStr, dwFlags, treatAsLE); +#if G_BYTE_ORDER == G_BIG_ENDIAN + if (treatAsLE) + flags |= MINIPAL_TREAT_AS_LITTLE_ENDIAN; +#endif + + if (len < 0) + len = (glong)strlen(str) + 1; + + glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags); + + map_error(err); + if (items_written) *items_written = errno == 0 ? ret : 0; + + if (ret <= 0) + return NULL; + + lpDestStr = malloc((ret + 1) * sizeof(gunichar2)); + ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags); + lpDestStr[ret] = '\0'; + + if (items_written) + *items_written = errno == 0 ? 
ret : 0; + map_error(err); return lpDestStr; } @@ -356,8 +367,17 @@ g_utf8_to_utf16_impl (const gchar *str, glong len, glong *items_read, glong *ite static gunichar2 * g_utf8_to_utf16le_custom_alloc_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, bool treatAsLE) { + guint flags = 0; errno = 0; - int ret = minipal_utf8_to_utf16_preallocated (str, len, 0, 0, 0, /* treatAsLE */ treatAsLE); +#if G_BYTE_ORDER == G_BIG_ENDIAN + if (treatAsLE) + flags = MINIPAL_TREAT_AS_LITTLE_ENDIAN; +#endif + if (len < 0) + len = (glong)strlen(str) + 1; + + glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags); + map_error(err); if (items_written) @@ -366,8 +386,10 @@ g_utf8_to_utf16le_custom_alloc_impl (const gchar *str, glong len, glong *items_r if (ret <= 0) return NULL; - gunichar2* lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); - ret = minipal_utf8_to_utf16_preallocated (str, len, &lpDestStr, ret, MB_ERR_INVALID_CHARS, /* treatAsLE */ treatAsLE); + gunichar2 *lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); + flags |= MINIPAL_MB_NO_REPLACE_INVALID_CHARS; + ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags); + map_error(err); return lpDestStr; } @@ -375,13 +397,13 @@ g_utf8_to_utf16le_custom_alloc_impl (const gchar *str, glong len, glong *items_r gunichar2 * g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MB_ERR_INVALID_CHARS, false); + return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MINIPAL_MB_NO_REPLACE_INVALID_CHARS, false); } gunichar2 * g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, 
MB_ERR_INVALID_CHARS, true); + return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MINIPAL_MB_NO_REPLACE_INVALID_CHARS, true); } gunichar2 * @@ -477,9 +499,31 @@ g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_wri static gchar * g_utf16_to_utf8_impl (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err, bool treatAsLE) { + guint flags = 0; errno = 0; gchar* lpDestStr = NULL; - int ret = minipal_utf16_to_utf8_allocate (str, len, &lpDestStr, treatAsLE); +#if G_BYTE_ORDER == G_BIG_ENDIAN + if (treatAsLE) + flags |= MINIPAL_TREAT_AS_LITTLE_ENDIAN; +#endif + if (len < 0) { + len = 0; + while (str[len]) + len++; + } + + glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, flags); + map_error(err); + + if (items_written) + *items_written = errno == 0 ? ret : 0; + + if (ret <= 0) + return NULL; + + lpDestStr = (gchar *)malloc((ret + 1) * sizeof(gchar)); + ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, flags); + lpDestStr[ret] = '\0'; if (items_written) *items_written = errno == 0 ? 
ret : 0; @@ -504,7 +548,14 @@ gchar * g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { errno = 0; - int ret = minipal_utf16_to_utf8_preallocated (str, len, 0, 0); + + if (len < 0) { + len = 0; + while (str[len]) + len++; + } + + glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, 0); map_error(err); if (items_written) @@ -513,8 +564,9 @@ g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read if (ret <= 0) return NULL; - gchar* lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); - ret = minipal_utf16_to_utf8_preallocated (str, len, &lpDestStr, ret); + gchar *lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); + ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, 0); + map_error(err); return lpDestStr; } diff --git a/src/mono/mono/metadata/object.c b/src/mono/mono/metadata/object.c index b267334250c6c2..b0289cebf414ae 100644 --- a/src/mono/mono/metadata/object.c +++ b/src/mono/mono/metadata/object.c @@ -327,7 +327,7 @@ get_type_init_exception_for_vtable (MonoVTable *vtable) mono_mem_manager_init_reflection_hashes (mem_manager); - /* + /* * If the initializing thread was rudely aborted, the exception is not stored * in the hash. 
*/ diff --git a/src/mono/mono/mini/CMakeLists.txt b/src/mono/mono/mini/CMakeLists.txt index 9df1cb56f14fa2..6f5e8507315dbf 100644 --- a/src/mono/mono/mini/CMakeLists.txt +++ b/src/mono/mono/mini/CMakeLists.txt @@ -24,10 +24,7 @@ set(OS_LIBS "-framework CoreFoundation" "-lobjc" "-lc++") elseif(HOST_ANDROID) set(OS_LIBS m dl log) elseif(HOST_LINUX) -set(OS_LIBS pthread m dl gcc_s) - if(NOT CLR_CMAKE_HOST_LINUX_MUSL) # glibc build also requires libc_nonshared.a for atexit(3) usage - set(OS_LIBS ${OS_LIBS} -l:libc_nonshared.a) - endif() +set(OS_LIBS pthread m dl) elseif(HOST_WIN32) set(OS_LIBS bcrypt.lib Mswsock.lib ws2_32.lib psapi.lib version.lib advapi32.lib winmm.lib kernel32.lib) elseif(HOST_SOLARIS) @@ -554,7 +551,7 @@ if(NOT DISABLE_EXECUTABLES) target_link_libraries(mono-sgen PRIVATE monoapi eglib_api monosgen-static) if(HAVE_ICU_SHIM) target_link_libraries(mono-sgen PRIVATE icu_shim_objects) - endif() + endif() target_link_libraries(mono-sgen PRIVATE ${OS_LIBS} ${LLVM_LIBS} ${ICU_LIBS} ${Z_LIBS}) # Alpine Linux implements ucontext in a different library if(CLR_CMAKE_HOST_ALPINE_LINUX AND TARGET_S390X) diff --git a/src/mono/mono/profiler/CMakeLists.txt b/src/mono/mono/profiler/CMakeLists.txt index f826371103b6f2..6bca00983686fc 100644 --- a/src/mono/mono/profiler/CMakeLists.txt +++ b/src/mono/mono/profiler/CMakeLists.txt @@ -14,7 +14,7 @@ if(NOT DISABLE_LIBS) # Build the logging profiler only for certain platforms add_library(mono-profiler-log SHARED helper.c log.c log-args.c ${ZLIB_SOURCES}) target_compile_definitions(mono-profiler-log PRIVATE -DMONO_DLL_EXPORT) - target_link_libraries(mono-profiler-log PRIVATE monosgen-shared monoapi eglib_objects ${CMAKE_DL_LIBS}) + target_link_libraries(mono-profiler-log PRIVATE monosgen-shared monoapi eglib_objects) if(HOST_ANDROID) target_link_libraries(mono-profiler-log PRIVATE log) endif() diff --git a/src/native/minipal/utf8.c b/src/native/minipal/utf8.c new file mode 100644 index 00000000000000..bacad116efcd8a --- 
/dev/null +++ b/src/native/minipal/utf8.c @@ -0,0 +1,2140 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#include + +#include +#include +#include +#include + +#define HIGH_SURROGATE_START 0xd800 +#define HIGH_SURROGATE_END 0xdbff +#define LOW_SURROGATE_START 0xdc00 +#define LOW_SURROGATE_END 0xdfff + +// Test if the wide character is a high surrogate +static bool IsHighSurrogate(const CHAR16_T c) +{ + return (c & 0xFC00) == HIGH_SURROGATE_START; +} + +// Test if the wide character is a low surrogate +static bool IsLowSurrogate(const CHAR16_T c) +{ + return (c & 0xFC00) == LOW_SURROGATE_START; +} + +// Test if the wide character is a surrogate half +static bool IsSurrogate(const CHAR16_T c) +{ + return (c & 0xF800) == HIGH_SURROGATE_START; +} + +typedef struct +{ + // Store our default string + unsigned char* byteStart; + CHAR16_T* charEnd; + const CHAR16_T strDefault[2]; + int strDefaultLength; + int fallbackCount; + int fallbackIndex; +} DecoderBuffer; + +static CHAR16_T DecoderReplacementFallbackBuffer_GetNextChar(DecoderBuffer* self) +{ + // We want it to get < 0 because == 0 means that the current/last character is a fallback + // and we need to detect recursion. We could have a flag but we already have this counter. + self->fallbackCount--; + self->fallbackIndex++; + + // Do we have anything left? 0 is now last fallback char, negative is nothing left + if (self->fallbackCount < 0) + return '\0'; + + // Need to get it out of the buffer. 
+ // Make sure it didn't wrap from the fast count-- path + if (self->fallbackCount == INT_MAX) + { + self->fallbackCount = -1; + return '\0'; + } + + // Now make sure its in the expected range + assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0); + + return self->strDefault[self->fallbackIndex]; +} + +// Fallback Methods +static bool DecoderReplacementFallbackBuffer_Fallback(DecoderBuffer* self) +{ + // We expect no previous fallback in our buffer + // We can't call recursively but others might (note, we don't test on last char!!!) + assert(self->fallbackCount < 1); + + // Go ahead and get our fallback + if (self->strDefaultLength == 0) + return false; + + self->fallbackCount = self->strDefaultLength; + self->fallbackIndex = -1; + + return true; +} + +// Fallback the current byte by sticking it into the remaining char buffer. +// This can only be called by our encodings (other have to use the public fallback methods), so +// we can use our DecoderNLS here too (except we don't). +// Returns true if we are successful, false if we can't fallback the character (no buffer space) +// So caller needs to throw buffer space if return false. +// Right now this has both bytes and bytes[], since we might have extra bytes, hence the +// array, and we might need the index, hence the byte* +// Don't touch ref chars unless we succeed +static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer* self, CHAR16_T** chars) +{ + assert(self->byteStart != NULL); + + bool fallbackResult = DecoderReplacementFallbackBuffer_Fallback(self); + + // See if there's a fallback character and we have an output buffer then copy our string. 
+ if (fallbackResult) + { + // Copy the chars to our output + CHAR16_T ch; + CHAR16_T* charTemp = *chars; + bool bHighSurrogate = false; + (void)bHighSurrogate; // unused in release build + while ((ch = DecoderReplacementFallbackBuffer_GetNextChar(self)) != 0) + { + // Make sure no mixed up surrogates + if (IsSurrogate(ch)) + { + if (IsHighSurrogate(ch)) + { + // High Surrogate + assert(!bHighSurrogate); + bHighSurrogate = true; + } + else + { + // Low surrogate + assert(bHighSurrogate); + bHighSurrogate = false; + } + } + + if (charTemp >= self->charEnd) + { + // No buffer space + return false; + } + + *(charTemp++) = ch; + } + + // Need to make sure that bHighSurrogate isn't true + assert(!bHighSurrogate); + + // Now we aren't going to be false, so its OK to update chars + *chars = charTemp; + } + + return true; +} + +// Clear the buffer +static void DecoderReplacementFallbackBuffer_Reset(DecoderBuffer* self) +{ + self->fallbackCount = -1; + self->fallbackIndex = -1; + self->byteStart = NULL; +} + +// Set the above values +static void DecoderBuffer_InternalInitialize(DecoderBuffer* self, unsigned char* byteStart, CHAR16_T* charEnd) +{ + self->byteStart = byteStart; + self->charEnd = charEnd; +} + +typedef struct +{ + const CHAR16_T strDefault[3]; + int strDefaultLength; + CHAR16_T* charStart; + CHAR16_T* charEnd; + bool setEncoder; + bool bUsedEncoder; + bool bFallingBack; + int iRecursionCount; + int fallbackCount; + int fallbackIndex; +} EncoderBuffer; + +#define MAX_RECURSION 250 + +// Set the above values +// This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. 
+static void EncoderReplacementFallbackBuffer_InternalInitialize(EncoderBuffer* self, CHAR16_T* charStart, CHAR16_T* charEnd, bool setEncoder) +{ + self->charStart = charStart; + self->charEnd = charEnd; + self->setEncoder = setEncoder; + self->bUsedEncoder = false; + self->bFallingBack = false; + self->iRecursionCount = 0; +} + +static CHAR16_T EncoderReplacementFallbackBuffer_InternalGetNextChar(EncoderBuffer* self) +{ + // We want it to get < 0 because == 0 means that the current/last character is a fallback + // and we need to detect recursion. We could have a flag but we already have this counter. + self->fallbackCount--; + self->fallbackIndex++; + + // Do we have anything left? 0 is now last fallback char, negative is nothing left + if (self->fallbackCount < 0) + return '\0'; + + // Need to get it out of the buffer. + // Make sure it didn't wrap from the fast count-- path + if (self->fallbackCount == INT_MAX) + { + self->fallbackCount = -1; + return '\0'; + } + + // Now make sure its in the expected range + assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0); + + CHAR16_T ch = self->strDefault[self->fallbackIndex]; + self->bFallingBack = (ch != 0); + if (ch == 0) self->iRecursionCount = 0; + return ch; +} + +// Fallback Methods +static bool EncoderReplacementFallbackBuffer_Fallback(EncoderBuffer* self) +{ + // If we had a buffer already we're being recursive, throw, it's probably at the suspect + // character in our array. + assert(self->fallbackCount < 1); + + // Go ahead and get our fallback + // Divide by 2 because we aren't a surrogate pair + self->fallbackCount = self->strDefaultLength / 2; + self->fallbackIndex = -1; + + return self->fallbackCount != 0; +} + +static bool EncoderReplacementFallbackBuffer_Fallback_Unknown(EncoderBuffer* self) +{ + // If we had a buffer already we're being recursive, throw, it's probably at the suspect + // character in our array. 
+ assert(self->fallbackCount < 1); + + // Go ahead and get our fallback + self->fallbackCount = self->strDefaultLength; + self->fallbackIndex = -1; + + return self->fallbackCount != 0; +} + +// Fallback the current character using the remaining buffer and encoder if necessary +// This can only be called by our encodings (other have to use the public fallback methods), so +// we can use our EncoderNLS here too. +// setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount +// +// Note that this could also change the contents of self->buffer.encoder, which is the same +// object that the caller is using, so the caller could mess up the encoder for us +// if they aren't careful. +static bool EncoderReplacementFallbackBuffer_InternalFallback(EncoderBuffer* self, CHAR16_T ch, CHAR16_T** chars) +{ + // Shouldn't have null charStart + assert(self->charStart != NULL); + + // See if it was a high surrogate + if (IsHighSurrogate(ch)) + { + // See if there's a low surrogate to go with it + if (*chars >= self->charEnd) + { + // Nothing left in input buffer + // No input, return 0 + } + else + { + // Might have a low surrogate + CHAR16_T cNext = **chars; + if (IsLowSurrogate(cNext)) + { + // If already falling back then fail + assert(!self->bFallingBack || self->iRecursionCount++ <= MAX_RECURSION); + + // Next is a surrogate, add it as surrogate pair, and increment chars + (*chars)++; + self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback_Unknown(self); + return self->bFallingBack; + } + + // Next isn't a low surrogate, just fallback the high surrogate + } + } + + // If already falling back then fail + assert(!self->bFallingBack || self->iRecursionCount++ <= MAX_RECURSION); + + // Fall back our char + self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback(self); + + return self->bFallingBack; +} + +static bool EncoderReplacementFallbackBuffer_MovePrevious(EncoderBuffer* self) +{ + // Back up one, only if we just 
processed the last character (or earlier) + if (self->fallbackCount >= -1 && self->fallbackIndex >= 0) + { + self->fallbackIndex--; + self->fallbackCount++; + return true; + } + + // Return false 'cause we couldn't do it. + return false; +} + +typedef struct +{ + union + { + DecoderBuffer decoder; + EncoderBuffer encoder; + } buffer; + + bool useFallback; + +#if BIGENDIAN + bool treatAsLE; +#endif +} UTF8Encoding; + +// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits +// while the actual character is being built in the lower bits. They are shifted together +// with the actual bits of the character. + +// bits 30 & 31 are used for pending bits fixup +#define FinalByte (1 << 29) +#define SupplimentarySeq (1 << 28) +#define ThreeByteSeq (1 << 27) + +static bool InRange(int c, int begin, int end) +{ + return begin <= c && c <= end; +} + +// During GetChars we had an invalid byte sequence +// pSrc is backed up to the start of the bad sequence if we didn't have room to +// fall it back. Otherwise pSrc remains where it is. 
+static bool FallbackInvalidByteSequence_Copy(UTF8Encoding* self, unsigned char** pSrc, CHAR16_T** pTarget)
+{
+    // Thin wrapper: writes the decoder replacement text at *pTarget and reports whether it fit.
+    // Only meant to be reached with fallback enabled.
+    assert(self->useFallback);
+
+    // Get our byte[]
+    unsigned char* pStart = *pSrc;
+    bool fallbackResult = DecoderReplacementFallbackBuffer_InternalFallback_Copy(&self->buffer.decoder, pTarget);
+
+    // Do the actual fallback
+    if (!fallbackResult)
+    {
+        // Oops, it failed, back up to pStart
+        // NOTE(review): pStart was read from *pSrc just above and nothing in between modifies *pSrc,
+        // so this store leaves the cursor where it already was - confirm whether a real rewind to the
+        // start of the bad sequence was intended.
+        *pSrc = pStart;
+        return false;
+    }
+
+    // It worked
+    return true;
+}
+
+// Counts the UTF-16 code units that decoding `count` bytes starting at `bytes` would produce.
+// The running total starts at `count` (one unit per byte); it is corrected downwards for the extra
+// bytes of multi-byte sequences and upwards by the replacement length for every invalid sequence.
+// Nothing is written to any output buffer.
+static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t count)
+{
+    assert(bytes != NULL);
+    // count is a size_t, so this check always holds
+    assert(count >= 0);
+
+    // Initialize stuff
+    unsigned char *pSrc = bytes;
+    unsigned char *pEnd = pSrc + count;
+    int availableBytes, chc;
+
+    // Start by assuming we have as many as count, charCount always includes the adjustment
+    // for the character being decoded
+    size_t charCount = count;
+    // ch holds the partially decoded sequence plus state flags in its high bits; 0 means nothing pending
+    int ch = 0;
+    bool fallbackUsed = false;
+
+    while (true)
+    {
+        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+        if (pSrc >= pEnd) break;
+
+        // read next byte.
The JIT optimization seems to be getting confused when + // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead + int cha = *pSrc; + + // no pending bits + if (ch == 0) goto ReadChar; + + pSrc++; + + // we are expecting to see trailing bytes like 10vvvvvv + if ((cha & 0xC0) != 0x80) + { + // This can be a valid starting byte for another UTF8 byte sequence, so let's put + // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence + pSrc--; + charCount += (ch >> 30); + goto InvalidByteSequence; + } + + // fold in the new byte + ch = (ch << 6) | (cha & 0x3F); + + if ((ch & FinalByte) == 0) + { + assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0); + + if ((ch & SupplimentarySeq) != 0) + { + if ((ch & (FinalByte >> 6)) != 0) + { + // this is 3rd byte (of 4 byte supplimentary) - nothing to do + continue; + } + + // 2nd byte, check for non-shortest form of supplimentary char and the valid + // supplimentary characters in range 0x010000 - 0x10FFFF at the same time + if (!InRange(ch & 0x1F0, 0x10, 0x100)) + { + goto InvalidByteSequence; + } + } + else + { + // Must be 2nd byte of a 3-byte sequence + // check for non-shortest form of 3 byte seq + if ((ch & (0x1F << 5)) == 0 || // non-shortest form + (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate + { + goto InvalidByteSequence; + } + } + continue; + } + + // ready to punch + + // adjust for surrogates in non-shortest form + if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) charCount--; + + goto EncodeChar; + + InvalidByteSequence: + // this code fragment should be close to the gotos referencing it + // Have to do fallback for invalid bytes + if (!fallbackUsed) + { + fallbackUsed = true; + if (self->useFallback) DecoderBuffer_InternalInitialize(&self->buffer.decoder, bytes, NULL); + } + charCount += self->buffer.decoder.strDefaultLength; + + ch = 0; + continue; + + ReadChar: + ch = *pSrc; + pSrc++; + + ProcessChar: + if (ch 
> 0x7F) + { + // If its > 0x7F, its start of a new multi-byte sequence + + // Long sequence, so unreserve our char. + charCount--; + + // bit 6 has to be non-zero for start of multibyte chars. + if ((ch & 0x40) == 0) goto InvalidByteSequence; + + // start a new long code + if ((ch & 0x20) != 0) + { + if ((ch & 0x10) != 0) + { + // 4 byte encoding - supplimentary character (2 surrogates) + + ch &= 0x0F; + + // check that bit 4 is zero and the valid supplimentary character + // range 0x000000 - 0x10FFFF at the same time + if (ch > 0x04) + { + ch |= 0xf0; + goto InvalidByteSequence; + } + + // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. + // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. + ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now + (1 << 30) | // If it dies on next byte we'll need an extra char + (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char + (SupplimentarySeq) | (SupplimentarySeq >> 6) | + (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); + + // Our character count will be 2 characters for these 4 bytes, so subtract another char + charCount--; + } + else + { + // 3 byte encoding + // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. + ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | + (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); + + // We'll expect 1 character for these 3 bytes, so subtract another char. 
+ charCount--; + } + } + else + { + // 2 byte encoding + + ch &= 0x1F; + + // check for non-shortest form + if (ch <= 1) + { + ch |= 0xc0; + goto InvalidByteSequence; + } + + // Add bit flags so we'll be flagged correctly + ch |= (FinalByte >> 6); + } + continue; + } + + EncodeChar: + + availableBytes = pEnd - pSrc; + + // don't fall into the fast decoding loop if we don't have enough bytes + if (availableBytes <= 13) + { + // try to get over the remainder of the ascii characters fast though + unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + while (pSrc < pLocalEnd) + { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) + goto ProcessChar; + } + // we are done + ch = 0; + break; + } + + // To compute the upper bound, assume that all characters are ASCII characters at this point, + // the boundary will be decreased for every non-ASCII character we encounter + // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences + unsigned char *pStop = pSrc + availableBytes - 7; + + while (pSrc < pStop) + { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) + { + goto LongCode; + } + + // get pSrc 2-byte aligned + if (((size_t)pSrc & 0x1) != 0) + { + ch = *pSrc; + pSrc++; + if (ch > 0x7F) + { + goto LongCode; + } + } + + // get pSrc 4-byte aligned + if (((size_t)pSrc & 0x2) != 0) + { + ch = *(unsigned short*)pSrc; + if ((ch & 0x8080) != 0) + { + goto LongCodeWithMask16; + } + pSrc += 2; + } + + + // Run 8 + 8 characters at a time! 
+ while (pSrc < pStop) + { + ch = *(int*)pSrc; + int chb = *(int*)(pSrc + 4); + if (((ch | chb) & (int)0x80808080) != 0) + { + goto LongCodeWithMask32; + } + pSrc += 8; + + // This is a really small loop - unroll it + if (pSrc >= pStop) + break; + + ch = *(int*)pSrc; + chb = *(int*)(pSrc + 4); + if (((ch | chb) & (int)0x80808080) != 0) + { + goto LongCodeWithMask32; + } + pSrc += 8; + } + break; + + LongCodeWithMask32 : +#if BIGENDIAN + // be careful about the sign extension + if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); + else +#endif + ch &= 0xFF; + + LongCodeWithMask16: +#if BIGENDIAN + if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8); + else +#endif + ch &= 0xFF; + + pSrc++; + if (ch <= 0x7F) + { + continue; + } + + LongCode: + chc = *pSrc; + pSrc++; + + if ( + // bit 6 has to be zero + (ch & 0x40) == 0 || + // we are expecting to see trailing bytes like 10vvvvvv + (chc & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc &= 0x3F; + + // start a new long code + if ((ch & 0x20) != 0) + { + // fold the first two bytes together + chc |= (ch & 0x0F) << 6; + + if ((ch & 0x10) != 0) + { + // 4 byte encoding - surrogate + ch = *pSrc; + if ( + // check that bit 4 is zero, the non-shortest form of surrogate + // and the valid surrogate range 0x000000 - 0x10FFFF at the same time + !InRange(chc >> 4, 0x01, 0x10) || + // we are expecting to see trailing bytes like 10vvvvvv + (ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc = (chc << 6) | (ch & 0x3F); + + ch = *(pSrc + 1); + // we are expecting to see trailing bytes like 10vvvvvv + if ((ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + pSrc += 2; + + // extra byte + charCount--; + } + else + { + // 3 byte encoding + ch = *pSrc; + if ( + // check for non-shortest form of 3 byte seq + (chc & (0x1F << 5)) == 0 || + // Can't have surrogates here. 
+                        (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
+                        // we are expecting to see trailing bytes like 10vvvvvv
+                        (ch & 0xC0) != 0x80)
+                    {
+                        goto BadLongCode;
+                    }
+                    pSrc++;
+
+                    // extra byte
+                    charCount--;
+                }
+            }
+            else
+            {
+                // 2 byte encoding
+
+                // check for non-shortest form
+                if ((ch & 0x1E) == 0) goto BadLongCode;
+            }
+
+            // extra byte
+            charCount--;
+        }
+
+        // no pending bits at this point
+        ch = 0;
+        continue;
+
+    BadLongCode:
+        // Step back over the two bytes the fast path consumed and let the slow loop re-examine them
+        pSrc -= 2;
+        ch = 0;
+        continue;
+    }
+
+    // May have a problem if we have to flush
+    if (ch != 0)
+    {
+        // We were already adjusting for these, so need to unadjust
+        // (bits 30 and 31 of the pending state carry the correction recorded when the lead byte was read)
+        charCount += (ch >> 30);
+        // Have to do fallback for invalid bytes
+        if (!fallbackUsed)
+        {
+            fallbackUsed = true;
+            if (self->useFallback) DecoderBuffer_InternalInitialize(&self->buffer.decoder, bytes, NULL);
+        }
+        // The truncated trailing sequence is counted as one replacement text
+        charCount += self->buffer.decoder.strDefaultLength;
+    }
+
+    // Shouldn't have anything in fallback buffer for GetCharCount
+    // (don't have to check m_throwOnOverflow for count)
+    assert(!fallbackUsed || !self->useFallback || self->buffer.decoder.fallbackCount < 0);
+
+    return charCount;
+}
+
+// Decodes byteCount UTF-8 bytes from `bytes` into at most charCount UTF-16 units at `chars`.
+// Returns the number of units written.
+// When the output runs out of room before anything could be written, errno is set to
+// MINIPAL_ERROR_INSUFFICIENT_BUFFER and 0 is returned; when some output was already produced,
+// decoding stops early and the partial count is returned.
+static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
+{
+    assert(chars != NULL);
+    // byteCount and charCount are size_t, so the next two checks always hold
+    assert(byteCount >= 0);
+    assert(charCount >= 0);
+    assert(bytes != NULL);
+
+    unsigned char *pSrc = bytes;
+    CHAR16_T *pTarget = chars;
+
+    unsigned char *pEnd = pSrc + byteCount;
+    CHAR16_T *pAllocatedBufferEnd = pTarget + charCount;
+
+    // ch holds the partially decoded sequence plus state flags in its high bits; 0 means nothing pending
+    int ch = 0;
+    int chc;
+
+    bool fallbackUsed = false;
+
+    while (true)
+    {
+        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+        if (pSrc >= pEnd) break;
+
+        // read next byte.
The JIT optimization seems to be getting confused when + // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead + int cha = *pSrc; + + if (ch == 0) + { + // no pending bits + goto ReadChar; + } + + pSrc++; + + // we are expecting to see trailing bytes like 10vvvvvv + if ((cha & 0xC0) != 0x80) + { + // This can be a valid starting byte for another UTF8 byte sequence, so let's put + // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence + pSrc--; + goto InvalidByteSequence; + } + + // fold in the new byte + ch = (ch << 6) | (cha & 0x3F); + + if ((ch & FinalByte) == 0) + { + // Not at last byte yet + assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0); + + if ((ch & SupplimentarySeq) != 0) + { + // Its a 4-byte supplimentary sequence + if ((ch & (FinalByte >> 6)) != 0) + { + // this is 3rd byte of 4 byte sequence - nothing to do + continue; + } + + // 2nd byte of 4 bytes + // check for non-shortest form of surrogate and the valid surrogate + // range 0x000000 - 0x10FFFF at the same time + if (!InRange(ch & 0x1F0, 0x10, 0x100)) + { + goto InvalidByteSequence; + } + } + else + { + // Must be 2nd byte of a 3-byte sequence + // check for non-shortest form of 3 byte seq + if ((ch & (0x1F << 5)) == 0 || // non-shortest form + (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate + { + goto InvalidByteSequence; + } + } + continue; + } + + // ready to punch + + // surrogate in shortest form? + // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? 
+ if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) + { + // let the range check for the second char throw the exception + if (pTarget < pAllocatedBufferEnd) + { + *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) + + (HIGH_SURROGATE_START - (0x10000 >> 10))); + pTarget++; + + ch = (ch & 0x3FF) + + (int)(LOW_SURROGATE_START); + } + } + + goto EncodeChar; + + InvalidByteSequence: + // this code fragment should be close to the gotos referencing it + // Have to do fallback for invalid bytes + if (!fallbackUsed) + { + fallbackUsed = true; + if (self->useFallback) DecoderBuffer_InternalInitialize(&self->buffer.decoder, bytes, pAllocatedBufferEnd); + } + + // That'll back us up the appropriate # of bytes if we didn't get anywhere + if (!FallbackInvalidByteSequence_Copy(self, &pSrc, &pTarget)) + { + // Check if we ran out of buffer space + assert(pSrc >= bytes || pTarget == chars); + + if (self->useFallback) DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder); + if (pTarget == chars) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } + ch = 0; + break; + } + + assert(pSrc >= bytes); + + ch = 0; + continue; + + ReadChar: + ch = *pSrc; + pSrc++; + + ProcessChar: + if (ch > 0x7F) + { + // If its > 0x7F, its start of a new multi-byte sequence + + // bit 6 has to be non-zero + if ((ch & 0x40) == 0) goto InvalidByteSequence; + + // start a new long code + if ((ch & 0x20) != 0) + { + if ((ch & 0x10) != 0) + { + // 4 byte encoding - supplimentary character (2 surrogates) + + ch &= 0x0F; + + // check that bit 4 is zero and the valid supplimentary character + // range 0x000000 - 0x10FFFF at the same time + if (ch > 0x04) + { + ch |= 0xf0; + goto InvalidByteSequence; + } + + ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | + (SupplimentarySeq) | (SupplimentarySeq >> 6) | + (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); + } + else + { + // 3 byte encoding + ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | + (ThreeByteSeq) | 
(ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); + } + } + else + { + // 2 byte encoding + + ch &= 0x1F; + + // check for non-shortest form + if (ch <= 1) + { + ch |= 0xc0; + goto InvalidByteSequence; + } + + ch |= (FinalByte >> 6); + } + continue; + } + + EncodeChar: + // write the pending character + if (pTarget >= pAllocatedBufferEnd) + { + // Fix chars so we make sure to throw if we didn't output anything + ch &= 0x1fffff; + if (ch > 0x7f) + { + if (ch > 0x7ff) + { + if (ch >= LOW_SURROGATE_START && + ch <= LOW_SURROGATE_END) + { + pSrc--; // It was 4 bytes + pTarget--; // 1 was stored already, but we can't remember 1/2, so back up + } + else if (ch > 0xffff) + { + pSrc--; // It was 4 bytes, nothing was stored + } + pSrc--; // It was at least 3 bytes + } + pSrc--; // It was at least 2 bytes + } + pSrc--; + + // Throw that we don't have enough room (pSrc could be < chars if we had started to process + // a 4 byte sequence already) + assert(pSrc >= bytes || pTarget == chars); + + if (pTarget == chars) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } + + // Don't store ch in decoder, we already backed up to its start + ch = 0; + + // Didn't throw, just use this buffer size. + break; + } + *pTarget = (CHAR16_T)ch; + pTarget++; + + int availableChars = pAllocatedBufferEnd - pTarget; + int availableBytes = pEnd - pSrc; + + // don't fall into the fast decoding loop if we don't have enough bytes + // Test for availableChars is done because pStop would be <= pTarget. + if (availableBytes <= 13) + { + // we may need as many as 1 character per byte + if (availableChars < availableBytes) + { + // not enough output room. 
no pending bits at this point + ch = 0; + continue; + } + + // try to get over the remainder of the ascii characters fast though + unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + while (pSrc < pLocalEnd) + { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) goto ProcessChar; + + *pTarget = (CHAR16_T)ch; + pTarget++; + } + // we are done + ch = 0; + break; + } + + // we may need as many as 1 character per byte, so reduce the byte count if necessary. + // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. + if (availableChars < availableBytes) availableBytes = availableChars; + + // To compute the upper bound, assume that all characters are ASCII characters at this point, + // the boundary will be decreased for every non-ASCII character we encounter + // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences + CHAR16_T *pStop = pTarget + availableBytes - 7; + + while (pTarget < pStop) + { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) goto LongCode; + + *pTarget = (CHAR16_T)ch; + pTarget++; + + // get pSrc to be 2-byte aligned + if ((((size_t)pSrc) & 0x1) != 0) + { + ch = *pSrc; + pSrc++; + if (ch > 0x7F) goto LongCode; + + *pTarget = (CHAR16_T)ch; + pTarget++; + } + + // get pSrc to be 4-byte aligned + if ((((size_t)pSrc) & 0x2) != 0) + { + ch = *(unsigned short*)pSrc; + if ((ch & 0x8080) != 0) goto LongCodeWithMask16; + + // Unfortunately, this is endianness sensitive +#if BIGENDIAN + if (!self->treatAsLE) + { + *pTarget = (CHAR16_T)((ch >> 8) & 0x7F); + pSrc += 2; + *(pTarget + 1) = (CHAR16_T)(ch & 0x7F); + pTarget += 2; + } + else +#endif + { + *pTarget = (CHAR16_T)(ch & 0x7F); + pSrc += 2; + *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F); + pTarget += 2; + } + } + + // Run 8 characters at a time! 
+ while (pTarget < pStop) + { + ch = *(int*)pSrc; + int chb = *(int*)(pSrc + 4); + if (((ch | chb) & (int)0x80808080) != 0) goto LongCodeWithMask32; + + // Unfortunately, this is endianness sensitive +#if BIGENDIAN + if (!self->treatAsLE) + { + *pTarget = (CHAR16_T)((ch >> 24) & 0x7F); + *(pTarget + 1) = (CHAR16_T)((ch >> 16) & 0x7F); + *(pTarget + 2) = (CHAR16_T)((ch >> 8) & 0x7F); + *(pTarget + 3) = (CHAR16_T)(ch & 0x7F); + pSrc += 8; + *(pTarget + 4) = (CHAR16_T)((chb >> 24) & 0x7F); + *(pTarget + 5) = (CHAR16_T)((chb >> 16) & 0x7F); + *(pTarget + 6) = (CHAR16_T)((chb >> 8) & 0x7F); + *(pTarget + 7) = (CHAR16_T)(chb & 0x7F); + pTarget += 8; + } + else +#endif + { + *pTarget = (CHAR16_T)(ch & 0x7F); + *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F); + *(pTarget + 2) = (CHAR16_T)((ch >> 16) & 0x7F); + *(pTarget + 3) = (CHAR16_T)((ch >> 24) & 0x7F); + pSrc += 8; + *(pTarget + 4) = (CHAR16_T)(chb & 0x7F); + *(pTarget + 5) = (CHAR16_T)((chb >> 8) & 0x7F); + *(pTarget + 6) = (CHAR16_T)((chb >> 16) & 0x7F); + *(pTarget + 7) = (CHAR16_T)((chb >> 24) & 0x7F); + pTarget += 8; + } + } + break; + + LongCodeWithMask32 : +#if BIGENDIAN + // be careful about the sign extension + if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); + else +#endif + ch &= 0xFF; + + LongCodeWithMask16: +#if BIGENDIAN + if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8); + else +#endif + ch &= 0xFF; + + pSrc++; + if (ch <= 0x7F) + { + *pTarget = (CHAR16_T)ch; + pTarget++; + continue; + } + + LongCode: + chc = *pSrc; + pSrc++; + + if ( + // bit 6 has to be zero + (ch & 0x40) == 0 || + // we are expecting to see trailing bytes like 10vvvvvv + (chc & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc &= 0x3F; + + // start a new long code + if ((ch & 0x20) != 0) + { + + // fold the first two bytes together + chc |= (ch & 0x0F) << 6; + + if ((ch & 0x10) != 0) + { + // 4 byte encoding - surrogate + ch = *pSrc; + if ( + // check that bit 4 is zero, the non-shortest form of surrogate + // and 
the valid surrogate range 0x000000 - 0x10FFFF at the same time + !InRange(chc >> 4, 0x01, 0x10) || + // we are expecting to see trailing bytes like 10vvvvvv + (ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc = (chc << 6) | (ch & 0x3F); + + ch = *(pSrc + 1); + // we are expecting to see trailing bytes like 10vvvvvv + if ((ch & 0xC0) != 0x80) goto BadLongCode; + + pSrc += 2; + + ch = (chc << 6) | (ch & 0x3F); + + *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) + + (HIGH_SURROGATE_START - (0x10000 >> 10))); + pTarget++; + + ch = (ch & 0x3FF) + (LOW_SURROGATE_START); + + // extra byte, we're already planning 2 chars for 2 of these bytes, + // but the big loop is testing the target against pStop, so we need + // to subtract 2 more or we risk overrunning the input. Subtract + // one here and one below. + pStop--; + } + else + { + // 3 byte encoding + ch = *pSrc; + if ( + // check for non-shortest form of 3 byte seq + (chc & (0x1F << 5)) == 0 || + // Can't have surrogates here. + (chc & (0xF800 >> 6)) == (0xD800 >> 6) || + // we are expecting to see trailing bytes like 10vvvvvv + (ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + pSrc++; + + ch = (chc << 6) | (ch & 0x3F); + + // extra byte, we're only expecting 1 char for each of these 3 bytes, + // but the loop is testing the target (not source) against pStop, so + // we need to subtract 2 more or we risk overrunning the input. + // Subtract 1 here and one more below + pStop--; + } + } + else + { + // 2 byte encoding + + ch &= 0x1F; + + // check for non-shortest form + if (ch <= 1) goto BadLongCode; + + ch = (ch << 6) | chc; + } + + *pTarget = (CHAR16_T)ch; + pTarget++; + + // extra byte, we're only expecting 1 char for each of these 2 bytes, + // but the loop is testing the target (not source) against pStop. + // subtract an extra count from pStop so that we don't overrun the input. 
+            pStop--;
+        }
+
+        assert(pTarget <= pAllocatedBufferEnd);
+
+        // no pending bits at this point
+        ch = 0;
+        continue;
+
+    BadLongCode:
+        // Step back over the two bytes the fast path consumed and let the slow loop re-examine them
+        pSrc -= 2;
+        ch = 0;
+        continue;
+    }
+
+    // A multi-byte sequence was still pending when the input ended
+    if (ch != 0)
+    {
+        // Have to do fallback for invalid bytes
+        if (!fallbackUsed)
+        {
+            fallbackUsed = true;
+            // NOTE(review): the in-loop InvalidByteSequence path passes pAllocatedBufferEnd as the output
+            // limit, whereas NULL is passed here - confirm this is intentional.
+            if (self->useFallback) DecoderBuffer_InternalInitialize(&self->buffer.decoder, bytes, NULL);
+        }
+
+        // This'll back us up the appropriate # of bytes if we didn't get anywhere
+        // NOTE(review): unlike the in-loop path, FallbackInvalidByteSequence_Copy is not called here, so no
+        // replacement text is written for a truncated trailing sequence although GetCharCount adds
+        // strDefaultLength for the same input. The branch below only runs when fallback is disabled -
+        // confirm the intended condition.
+        if (!self->useFallback)
+        {
+            assert(pSrc >= bytes || pTarget == chars);
+
+            // Ran out of buffer space
+            // Need to throw an exception?
+            // NOTE(review): this condition cannot hold, the enclosing branch requires !self->useFallback
+            if (self->useFallback) DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder);
+            if (pTarget == chars)
+            {
+                errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+                return 0;
+            }
+        }
+        assert(pSrc >= bytes);
+        ch = 0;
+    }
+
+    // Shouldn't have anything in fallback buffer for GetChars
+    // (don't have to check m_throwOnOverflow for chars)
+    assert(!fallbackUsed || self->buffer.decoder.fallbackCount < 0);
+
+    // Number of UTF-16 units written; the pointer difference is converted to the int return type
+    return pTarget - chars;
+}
+
+static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, unsigned char* bytes, size_t byteCount)
+{
+    assert(chars != NULL);
+    assert(byteCount >= 0);
+    assert(charCount >= 0);
+    assert(bytes != NULL);
+
+    // For fallback we may need a fallback buffer.
+    // We wait to initialize it though in case we don't have any broken input unicode
+    bool fallbackUsed = false;
+    CHAR16_T *pSrc = chars;
+    unsigned char *pTarget = bytes;
+
+    CHAR16_T *pEnd = pSrc + charCount;
+    unsigned char *pAllocatedBufferEnd = pTarget + byteCount;
+
+    int ch = 0;
+    int chd;
+
+    // assume that JIT will enregister pSrc, pTarget and ch
+
+    while (true)
+    {
+        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+        if (pSrc >= pEnd)
+        {
+            if (ch == 0)
+            {
+                // Check if there's anything left to get out of the fallback buffer
+                ch = fallbackUsed ?
EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0; + if (ch > 0) goto ProcessChar; + } + else + { + // Case of leftover surrogates in the fallback buffer + if (fallbackUsed && self->buffer.encoder.bFallingBack) + { + assert(ch >= 0xD800 && ch <= 0xDBFF); + + int cha = ch; + + ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder); + + if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END)) + { + ch = ch + (cha << 10) + (0x10000 - LOW_SURROGATE_START - (HIGH_SURROGATE_START << 10)); + goto EncodeChar; + } + else if (ch > 0) + { + goto ProcessChar; + } + + break; + } + } + + // attempt to encode the partial surrogate (will fail or ignore) + if (ch > 0) goto EncodeChar; + + // We're done + break; + } + + if (ch > 0) + { + // We have a high surrogate left over from a previous loop. + assert(ch >= 0xD800 && ch <= 0xDBFF); + + // use separate helper variables for local contexts so that the jit optimizations + // won't get confused about the variable lifetimes + int cha = *pSrc; + + // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. + if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END)) + { + ch = cha + (ch << 10) + + (0x10000 + - LOW_SURROGATE_START + - (HIGH_SURROGATE_START << 10)); + + pSrc++; + } + // else ch is still high surrogate and encoding will fail + + // attempt to encode the surrogate or partial surrogate + goto EncodeChar; + } + + // If we've used a fallback, then we have to check for it + if (fallbackUsed) + { + ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder); + if (ch > 0) goto ProcessChar; + } + + // read next char. 
The JIT optimization seems to be getting confused when + // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead + ch = *pSrc; + pSrc++; + + ProcessChar: + if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END)) continue; + + // either good char or partial surrogate + + EncodeChar: + // throw exception on partial surrogate if necessary + if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END)) + { + // Lone surrogates aren't allowed, we have to do fallback for them + // Have to make a fallback buffer if we don't have one + if (!fallbackUsed) + { + // wait on fallbacks if we can + // For fallback we may need a fallback buffer + fallbackUsed = true; + + // Set our internal fallback interesting things. + EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, pEnd, true); + } + + // Do our fallback. Actually we already know its a mixed up surrogate, + // so the ref pSrc isn't gonna do anything. + EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc); + + // Ignore it if we don't throw + ch = 0; + continue; + } + + // Count bytes needed + int bytesNeeded = 1; + if (ch > 0x7F) + { + if (ch > 0x7FF) + { + if (ch > 0xFFFF) + { + bytesNeeded++; // 4 bytes (surrogate pair) + } + bytesNeeded++; // 3 bytes (800-FFFF) + } + bytesNeeded++; // 2 bytes (80-7FF) + } + + if (pTarget > pAllocatedBufferEnd - bytesNeeded) + { + // Left over surrogate from last time will cause pSrc == chars, so we'll throw + if (fallbackUsed && self->buffer.encoder.bFallingBack) + { + EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder); // Didn't use this fallback char + if (ch > 0xFFFF) + EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder); // Was surrogate, didn't use 2nd part either + } + else + { + pSrc--; // Didn't use this char + if (ch > 0xFFFF) + pSrc--; // Was surrogate, didn't use 2nd part either + } + + assert(pSrc >= chars || pTarget == bytes); + + if (pTarget == bytes) 
// Throw if we must + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } + ch = 0; // Nothing left over (we backed up to start of pair if supplimentary) + break; + } + + if (ch <= 0x7F) + { + *pTarget = (unsigned char)ch; + } + else + { + // use separate helper variables for local contexts so that the jit optimizations + // won't get confused about the variable lifetimes + int chb; + if (ch <= 0x7FF) + { + // 2 unsigned char encoding + chb = (unsigned char)(0xC0 | (ch >> 6)); + } + else + { + if (ch <= 0xFFFF) + { + chb = (unsigned char)(0xE0 | (ch >> 12)); + } + else + { + *pTarget = (unsigned char)(0xF0 | (ch >> 18)); + pTarget++; + + chb = 0x80 | ((ch >> 12) & 0x3F); + } + *pTarget = (unsigned char)chb; + pTarget++; + + chb = 0x80 | ((ch >> 6) & 0x3F); + } + *pTarget = (unsigned char)chb; + pTarget++; + + *pTarget = (unsigned char)0x80 | (ch & 0x3F); + } + + pTarget++; + + // If still have fallback don't do fast loop + if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0) + goto ProcessChar; + + int availableChars = pEnd - pSrc; + int availableBytes = pAllocatedBufferEnd - pTarget; + + // don't fall into the fast decoding loop if we don't have enough characters + // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. + if (availableChars <= 13) + { + // we are hoping for 1 unsigned char per char + if (availableBytes < availableChars) + { + // not enough output room. 
no pending bits at this point + ch = 0; + continue; + } + + // try to get over the remainder of the ascii characters fast though + CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + while (pSrc < pLocalEnd) + { + ch = *pSrc; + pSrc++; + + // Not ASCII, need more than 1 unsigned char per char + if (ch > 0x7F) goto ProcessChar; + + *pTarget = (unsigned char)ch; + pTarget++; + } + // we are done, let ch be 0 to clear encoder + ch = 0; + break; + } + + // we need at least 1 unsigned char per character, but Convert might allow us to convert + // only part of the input, so try as much as we can. Reduce charCount if necessary + if (availableBytes < availableChars) + { + availableChars = availableBytes; + } + + // FASTLOOP: + // - optimistic range checks + // - fallbacks to the slow loop for all special cases, exception throwing, etc. + + // To compute the upper bound, assume that all characters are ASCII characters at this point, + // the boundary will be decreased for every non-ASCII character we encounter + // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates + // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. + CHAR16_T *pStop = pSrc + availableChars - 5; + + while (pSrc < pStop) + { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) goto LongCode; + + *pTarget = (unsigned char)ch; + pTarget++; + + // get pSrc aligned + if (((size_t)pSrc & 0x2) != 0) + { + ch = *pSrc; + pSrc++; + if (ch > 0x7F) goto LongCode; + + *pTarget = (unsigned char)ch; + pTarget++; + } + + // Run 4 characters at a time! 
+ while (pSrc < pStop) + { + ch = *(int*)pSrc; + int chc = *(int*)(pSrc + 2); + + if (((ch | chc) & (int)0xFF80FF80) != 0) goto LongCodeWithMask; + + // Unfortunately, this is endianness sensitive +#if BIGENDIAN + if (!self->treatAsLE) + { + *pTarget = (unsigned char)(ch >> 16); + *(pTarget + 1) = (unsigned char)ch; + pSrc += 4; + *(pTarget + 2) = (unsigned char)(chc >> 16); + *(pTarget + 3) = (unsigned char)chc; + pTarget += 4; + } + else +#endif + { + *pTarget = (unsigned char)ch; + *(pTarget + 1) = (unsigned char)(ch >> 16); + pSrc += 4; + *(pTarget + 2) = (unsigned char)chc; + *(pTarget + 3) = (unsigned char)(chc >> 16); + pTarget += 4; + } + } + continue; + + LongCodeWithMask: +#if BIGENDIAN + // be careful about the sign extension + if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); + else +#endif + ch = (CHAR16_T)ch; + pSrc++; + + if (ch > 0x7F) goto LongCode; + + *pTarget = (unsigned char)ch; + pTarget++; + continue; + + LongCode: + // use separate helper variables for slow and fast loop so that the jit optimizations + // won't get confused about the variable lifetimes + if (ch <= 0x7FF) + { + // 2 unsigned char encoding + chd = 0xC0 | (ch >> 6); + } + else + { + if (!InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END)) + { + // 3 unsigned char encoding + chd = 0xE0 | (ch >> 12); + } + else + { + // 4 unsigned char encoding - high surrogate + low surrogate + if (ch > HIGH_SURROGATE_END) + { + // low without high -> bad, try again in slow loop + pSrc -= 1; + break; + } + + chd = *pSrc; + pSrc++; + + if (!InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END)) + { + // high not followed by low -> bad, try again in slow loop + pSrc -= 2; + break; + } + + ch = chd + (ch << 10) + + (0x10000 + - LOW_SURROGATE_START + - (HIGH_SURROGATE_START << 10)); + + *pTarget = (unsigned char)(0xF0 | (ch >> 18)); + // pStop - this unsigned char is compensated by the second surrogate character + // 2 input chars require 4 output bytes. 
2 have been anticipated already
+                    // and 2 more will be accounted for by the 2 pStop-- calls below.
+                    pTarget++;
+
+                    chd = 0x80 | ((ch >> 12) & 0x3F);
+                }
+                *pTarget = (unsigned char)chd;
+                pStop--; // 3 unsigned char sequence for 1 char, so need pStop-- and the one below too.
+                pTarget++;
+
+                chd = 0x80 | ((ch >> 6) & 0x3F);
+            }
+            *pTarget = (unsigned char)chd;
+            pStop--; // 2 unsigned char sequence for 1 char so need pStop--.
+            pTarget++;
+
+            *pTarget = (unsigned char)(0x80 | (ch & 0x3F));
+            // pStop - this unsigned char is already included
+            pTarget++;
+        }
+
+        assert(pTarget <= pAllocatedBufferEnd);
+
+        // no pending char at this point
+        ch = 0;
+    }
+
+    return (int)(pTarget - bytes);
+}
+
+// Computes how many UTF-8 bytes are needed to encode the `count` UTF-16 code
+// units starting at `chars`. Nothing is written; only the total is returned.
+//
+// Accounting scheme: byteCount starts at `count` (one byte per code unit) and is
+// corrected as input is examined:
+//   - code unit in 0x80..0x7FF                  -> +1 (2 bytes total)
+//   - non-surrogate code unit above 0x7FF       -> +2 (3 bytes total)
+//   - high surrogate followed by low surrogate  -> +2 for the pair (4 bytes total,
+//     2 of them already pre-counted, one per code unit)
+//   - lone surrogate -> its pre-counted byte is removed and the characters handed
+//     back by the replacement fallback in self->buffer.encoder are counted instead.
+//
+// Structure: the outer while (true) is the "slow loop" that handles every special
+// case. Once no surrogate or fallback character is pending and more than 13 code
+// units remain, the inner "fast loop" scans with fewer checks and breaks back out
+// to the slow loop when it meets a surrogate it cannot pair.
+static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count)
+{
+    // For fallback we may need a fallback buffer.
+    // We wait to initialize it though in case we don't have any broken input unicode
+    bool fallbackUsed = false;
+    CHAR16_T *pSrc = chars;
+    CHAR16_T *pEnd = pSrc + count;
+
+    // Start by assuming we have as many as count
+    size_t byteCount = count;
+
+    // Pending state carried between iterations: 0 means nothing pending, otherwise
+    // a high surrogate that is still waiting for its low half.
+    int ch = 0;
+
+    while (true)
+    {
+        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+        if (pSrc >= pEnd)
+        {
+
+            if (ch == 0)
+            {
+                // Unroll any fallback that happens at the end
+                ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0;
+                if (ch > 0)
+                {
+                    // Fallback characters are not part of the initial `count` estimate.
+                    byteCount++;
+                    goto ProcessChar;
+                }
+            }
+            else
+            {
+                // Case of surrogates in the fallback.
+                if (fallbackUsed && self->buffer.encoder.bFallingBack)
+                {
+                    assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+                    ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+                    byteCount++;
+
+                    if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END))
+                    {
+                        // Valid pair from the fallback: only the size matters, so use a
+                        // stand-in value above 0x7FF that is not itself a surrogate.
+                        ch = 0xfffd;
+                        byteCount++;
+                        goto EncodeChar;
+                    }
+                    else if (ch > 0)
+                    {
+                        goto ProcessChar;
+                    }
+                    else
+                    {
+                        byteCount--; // ignore last one.
+                        break;
+                    }
+                }
+            }
+
+            if (ch <= 0)
+            {
+                break;
+            }
+
+            // Input ended with a pending high surrogate: count it, then let EncodeChar
+            // route it through the fallback (which subtracts the 1 again).
+            byteCount++;
+            goto EncodeChar;
+        }
+
+        if (ch > 0)
+        {
+            assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+            // use separate helper variables for local contexts so that the jit optimizations
+            // won't get confused about the variable lifetimes
+            int cha = *pSrc;
+
+            // count the pending surrogate
+            byteCount++;
+
+            // The previous code unit was a high surrogate, so a low surrogate is expected here.
+            if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END))
+            {
+                // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
+                ch = 0xfffd;
+                // ch = cha + (ch << 10) +
+                //    (0x10000
+                //    - LOW_SURROGATE_START
+                //    - (HIGH_SURROGATE_START << 10) );
+
+                // Use this next char
+                pSrc++;
+            }
+            // else ch is still high surrogate and encoding will fail (so don't add count)
+
+            // attempt to encode the surrogate or partial surrogate
+            goto EncodeChar;
+        }
+
+        // If we've used a fallback, then we have to check for it
+        if (fallbackUsed)
+        {
+            ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+            if (ch > 0)
+            {
+                // We have an extra byte we weren't expecting.
+                byteCount++;
+                goto ProcessChar;
+            }
+        }
+
+        // read next char. The JIT optimization seems to be getting confused when
+        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+        ch = *pSrc;
+        pSrc++;
+
+    ProcessChar:
+        if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END))
+        {
+            // Keep ch pending; this surrogate is counted on the next iteration, once
+            // we know whether a low surrogate follows.
+            byteCount--;
+            continue;
+        }
+        // either good char or partial surrogate
+
+    EncodeChar:
+        // Anything still in the surrogate range here is unpaired.
+        if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+        {
+            // Lone surrogates aren't allowed
+            // Have to make a fallback buffer if we don't have one
+            if (!fallbackUsed)
+            {
+                // wait on fallbacks if we can
+                // For fallback we may need a fallback buffer
+                fallbackUsed = true;
+
+                // Initialize the fallback buffer over the whole input range; the last
+                // argument is false here, whereas GetBytes passes true.
+                EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, chars + count, false);
+            }
+
+            // Do our fallback. Actually we already know it's a mixed-up surrogate,
+            // so the ref pSrc isn't gonna do anything.
+            EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc);
+
+            // Drop the byte that was pre-counted for this code unit; the replacement
+            // characters are counted as they are pulled from the fallback buffer.
+            byteCount--;
+            ch = 0;
+            continue;
+        }
+
+        // Count them
+        if (ch > 0x7F)
+        {
+            if (ch > 0x7FF)
+            {
+                // the extra surrogate byte was compensated by the second surrogate character
+                // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
+                byteCount++;
+            }
+            byteCount++;
+        }
+
+#if WIN64
+        // check for overflow
+        // NOTE(review): byteCount is declared size_t above, so `byteCount < 0` can
+        // never be true; this looks like a leftover from a signed counter - confirm.
+        if (byteCount < 0)
+        {
+            break;
+        }
+#endif
+
+        // If still have fallback don't do fast loop
+        if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
+        {
+            // We're reserving 1 byte for each char by default
+            byteCount++;
+            goto ProcessChar;
+        }
+
+        // NOTE(review): the pointer difference is narrowed to int here; inputs with
+        // more than INT_MAX remaining code units would not fit - confirm callers bound it.
+        int availableChars = pEnd - pSrc;
+
+        // don't fall into the fast decoding loop if we don't have enough characters
+        if (availableChars <= 13)
+        {
+            // try to get over the remainder of the ascii characters fast though
+            CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+            while (pSrc < pLocalEnd)
+            {
+                ch = *pSrc;
+                pSrc++;
+                // ASCII needs no adjustment (already counted as 1 byte); anything else
+                // goes back through the slow path.
+                if (ch > 0x7F) goto ProcessChar;
+            }
+
+            // we are done
+            break;
+        }
+
+#if WIN64
+        // make sure that we won't get a silent overflow inside the fast loop
+        // (Fall out to slow loop if we have this many characters)
+        availableChars &= 0x0FFFFFFF;
+#endif
+
+        // To compute the upper bound, assume that all characters are ASCII characters at this point,
+        // the boundary will be decreased for every non-ASCII character we encounter
+        // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
+        CHAR16_T *pStop = pSrc + availableChars - (3 + 4);
+
+        // FASTLOOP
+        while (pSrc < pStop)
+        {
+            ch = *pSrc;
+            pSrc++;
+
+            if (ch > 0x7F) // Not ASCII
+            {
+                if (ch > 0x7FF) // Not 2 Byte
+                {
+                    if ((ch & 0xF800) == 0xD800) // See if it's a surrogate
+                        goto LongCode;
+                    byteCount++;
+                }
+                byteCount++;
+            }
+
+            // get pSrc aligned: if bit 1 of the address is set, consume one more code
+            // unit before switching to the int-sized loads below
+            if (((size_t)pSrc & 0x2) != 0)
+            {
+                ch = *pSrc;
+                pSrc++;
+                if (ch > 0x7F) // Not ASCII
+                {
+                    if (ch > 0x7FF) // Not 2 Byte
+                    {
+                        if ((ch & 0xF800) == 0xD800) // See if it's a surrogate
+                            goto LongCode;
+                        byteCount++;
+                    }
+                    byteCount++;
+                }
+            }
+
+            // Run 2 * 4 characters at a time!
+            // Each int load is treated as two packed code units (the masks below test
+            // both 16-bit halves), so ch and chc together cover four code units.
+            while (pSrc < pStop)
+            {
+                ch = *(int*)pSrc;
+                int chc = *(int*)(pSrc + 2);
+                if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
+                {
+                    if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
+                    {
+                        goto LongCodeWithMask;
+                    }
+
+
+                    // All four are at most 0x7FF: add one byte for each non-ASCII one.
+                    if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits)
+                        byteCount++;
+                    if ((ch & (int)0xFF80) != 0)
+                        byteCount++;
+                    if ((chc & (int)0xFF800000) != 0)
+                        byteCount++;
+                    if ((chc & (int)0xFF80) != 0)
+                        byteCount++;
+                }
+                pSrc += 4;
+
+                // Second group of four, same logic as above.
+                ch = *(int*)pSrc;
+                chc = *(int*)(pSrc + 2);
+                if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
+                {
+                    if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
+                    {
+                        goto LongCodeWithMask;
+                    }
+
+                    if ((ch & (int)0xFF800000) != 0)
+                        byteCount++;
+                    if ((ch & (int)0xFF80) != 0)
+                        byteCount++;
+                    if ((chc & (int)0xFF800000) != 0)
+                        byteCount++;
+                    if ((chc & (int)0xFF80) != 0)
+                        byteCount++;
+                }
+                pSrc += 4;
+            }
+            break;
+
+        LongCodeWithMask:
+            // Recover the first code unit of the group from the packed int: the high
+            // half when BIGENDIAN is set and treatAsLE is false, the low half otherwise.
+#if BIGENDIAN
+            // be careful about the sign extension
+            if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+            else
+#endif
+            ch = (CHAR16_T)ch;
+
+            pSrc++;
+
+            if (ch <= 0x7F)
+            {
+                // ASCII: nothing to add, resume the fast loop at the next code unit.
+                continue;
+            }
+
+        LongCode:
+            // use separate helper variables for slow and fast loop so that the jit optimizations
+            // won't get confused about the variable lifetimes
+            if (ch > 0x7FF)
+            {
+                if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+                {
+                    // 4 byte encoding - high surrogate + low surrogate
+
+                    int chd = *pSrc;
+                    if (
+                        ch > HIGH_SURROGATE_END ||
+                        !InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END))
+                    {
+                        // Back up and drop out to slow loop to figure out error
+                        pSrc--;
+                        break;
+                    }
+                    pSrc++;
+
+                    // byteCount - this byte is compensated by the second surrogate character
+                }
+                byteCount++;
+            }
+            byteCount++;
+
+            // byteCount - the last byte is already included
+        }
+
+        // no pending char at this point
+        ch = 0;
+    }
+
+#if WIN64
+    // check for overflow
+    // NOTE(review): always true for the size_t byteCount declared above - confirm intent.
+    assert(byteCount >= 0);
+#endif
+    assert(!fallbackUsed
|| self->buffer.encoder.fallbackCount < 0);
+
+    return byteCount;
+}
+
+// Length query for UTF-8 -> UTF-16: returns what GetCharCount reports for
+// `sourceLength` bytes at `source` (the same quantity that
+// minipal_convert_utf8_to_utf16 compares against its destinationLength).
+// errno is reset on entry; an empty input returns 0 immediately.
+// MINIPAL_MB_NO_REPLACE_INVALID_CHARS in `flags` turns the replacement fallback off.
+size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags)
+{
+    errno = 0;
+
+    if (sourceLength == 0)
+        return 0;
+
+    // Decoder state with a single U+FFFD as the replacement string.
+    UTF8Encoding decoder =
+    {
+        .buffer =
+        {
+            .decoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                .strDefault = { 0xFFFD, 0 },
+                .strDefaultLength = 1
+            }
+        },
+        .useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS),
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    return GetCharCount(&decoder, (unsigned char*)source, sourceLength);
+}
+
+// Length query for UTF-16 -> UTF-8: returns GetByteCount's result for
+// `sourceLength` code units at `source`. errno is reset on entry; an empty
+// input returns 0 immediately. The fallback is always enabled here, and `flags`
+// is only consulted on BIGENDIAN builds (for MINIPAL_TREAT_AS_LITTLE_ENDIAN).
+size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags)
+{
+    errno = 0;
+
+    if (sourceLength == 0)
+        return 0;
+
+#if !BIGENDIAN
+    (void)flags; // unused
+#endif
+
+    // Encoder state: the replacement string holds U+FFFD twice so that a
+    // surrogate pair can be replaced as well.
+    UTF8Encoding encoder =
+    {
+        .buffer =
+        {
+            .encoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                .strDefault = { 0xFFFD, 0xFFFD, 0 },
+                .strDefaultLength = 2
+            }
+        },
+        .useFallback = true,
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    return GetByteCount(&encoder, (CHAR16_T*)source, sourceLength);
+}
+
+// Converts `sourceLength` bytes of UTF-8 at `source` into `destination`.
+// The required length is measured first; if it exceeds `destinationLength`,
+// errno is set to MINIPAL_ERROR_INSUFFICIENT_BUFFER and 0 is returned without
+// calling GetChars. Otherwise the value returned by GetChars is passed on,
+// or 0 if errno is non-zero afterwards. An empty input returns 0 with errno cleared.
+size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags)
+{
+    errno = 0;
+
+    if (sourceLength == 0)
+        return 0;
+
+    // Decoder state with a single U+FFFD as the replacement string.
+    UTF8Encoding decoder =
+    {
+        .buffer =
+        {
+            .decoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                .strDefault = { 0xFFFD, 0 },
+                .strDefaultLength = 1
+            }
+        },
+        .useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS),
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    // Reject an undersized destination before converting anything.
+    if (GetCharCount(&decoder, (unsigned char*)source, sourceLength) > destinationLength)
+    {
+        errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+        return 0;
+    }
+
+    size_t written = GetChars(&decoder, (unsigned char*)source, sourceLength, destination, destinationLength);
+    return errno ? 0 : written;
+}
+
+// Converts `sourceLength` UTF-16 code units at `source` into UTF-8 in `destination`.
+// The required length is measured first; if it exceeds `destinationLength`,
+// errno is set to MINIPAL_ERROR_INSUFFICIENT_BUFFER and 0 is returned without
+// calling GetBytes. Otherwise the value returned by GetBytes is passed on,
+// or 0 if errno is non-zero afterwards. An empty input returns 0 with errno cleared.
+size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, unsigned int flags)
+{
+    errno = 0;
+
+    if (sourceLength == 0)
+        return 0;
+
+#if !BIGENDIAN
+    (void)flags; // unused
+#endif
+
+    // Encoder state: the replacement string holds U+FFFD twice so that a
+    // surrogate pair can be replaced as well.
+    UTF8Encoding encoder =
+    {
+        .buffer =
+        {
+            .encoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                .strDefault = { 0xFFFD, 0xFFFD, 0 },
+                .strDefaultLength = 2
+            }
+        },
+        .useFallback = true,
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    // Reject an undersized destination before converting anything.
+    if (GetByteCount(&encoder, (CHAR16_T*)source, sourceLength) > destinationLength)
+    {
+        errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+        return 0;
+    }
+
+    size_t written = GetBytes(&encoder, (CHAR16_T*)source, sourceLength, (unsigned char*)destination, destinationLength);
+    return errno ? 0 : written;
+}
diff --git a/src/native/minipal/utf8.cpp b/src/native/minipal/utf8.cpp
deleted file mode 100644
index 875b4eeb6008ca..00000000000000
--- a/src/native/minipal/utf8.cpp
+++ /dev/null
@@ -1,2901 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-/*++
-
-Module Name:
-
-    unicode/utf8.c
-
-Abstract:
-    Functions to encode and decode UTF-8 strings. This is a port of the C# version from Utf8Encoding.cs.
- -Revision History: - ---*/ - -#include - -#include -#include -#include -#include - -#define FASTLOOP - -#ifdef TARGET_WINDOWS -#define W(str) L ## str -#else -#define W(str) u##str -#endif - -inline void *operator new(size_t, void *p) throw () { return p; } - -struct CharUnicodeInfo -{ - static const char16_t HIGH_SURROGATE_START = 0xd800; - static const char16_t HIGH_SURROGATE_END = 0xdbff; - static const char16_t LOW_SURROGATE_START = 0xdc00; - static const char16_t LOW_SURROGATE_END = 0xdfff; -}; - -struct Char -{ - // Test if the wide character is a high surrogate - static bool IsHighSurrogate(const char16_t c) - { - return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START; - } - - // Test if the wide character is a low surrogate - static bool IsLowSurrogate(const char16_t c) - { - return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START; - } - - // Test if the wide character is a surrogate half - static bool IsSurrogate(const char16_t c) - { - return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START; - } - - // Test if the wide character is a high surrogate - static bool IsHighSurrogate(const char16_t* s, int index) - { - return IsHighSurrogate(s[index]); - } - - // Test if the wide character is a low surrogate - static bool IsLowSurrogate(const char16_t* s, int index) - { - return IsLowSurrogate(s[index]); - } - - // Test if the wide character is a surrogate half - static bool IsSurrogate(const char16_t* s, int index) - { - return IsSurrogate(s[index]); - } -}; - -size_t wcslen(const char16_t* str) -{ - size_t nChar = 0; - while (*str++) nChar++; - return nChar; -} - -int wcscpy_s(char16_t *_Dst, size_t _SizeInWords, const char16_t *_Src) -{ - - char16_t* p = _Dst; - size_t available = _SizeInWords; - - if (!_Src || !_Dst || _SizeInWords == 0) return EINVAL; - - while ((*p++ = *_Src++) != 0 && --available > 0); - - if (available == 0) - { - _Dst = 0; - return ERANGE; - } - -#ifdef DEBUG - size_t offset = _SizeInWords - available + 1; - if (offset 
< _SizeInWords) - { - memset((_Dst) + (offset), 0xFD, ((_SizeInWords) - (offset)) * sizeof(*(_Dst))); - } -#endif - - return 0; -} - -int wcscat_s(char16_t *_Dst, size_t _SizeInWords, const char16_t *_Src) -{ - char16_t* p = _Dst; - size_t available = _SizeInWords; - - if (!_Src || !_Dst || _SizeInWords == 0) return EINVAL; - - while (available > 0 && *p != 0) - { - p++; - available--; - } - - if (available == 0) - { - _Dst = 0; - return EINVAL; - } - - while ((*p++ = *_Src++) != 0 && --available > 0) - { - } - - if (available == 0) - { - _Dst = 0; - return ERANGE; - } - -#ifdef DEBUG - size_t offset = _SizeInWords - available + 1; - if (offset < _SizeInWords) - { - memset((_Dst) + (offset), 0xFD, ((_SizeInWords) - (offset)) * sizeof(*(_Dst))); - } -#endif - return 0; -} - -#define ContractAssert(cond) \ - if (!(cond)) \ - { \ - errno = ERROR_INVALID_PARAMETER; \ - return 0; \ - } - -#define ContractAssertVoid(cond) \ - if (!(cond)) \ - { \ - errno = ERROR_INVALID_PARAMETER; \ - return; \ - } - -#define ContractAssertFreeFallback(cond) \ - if (!(cond)) \ - { \ - errno = ERROR_INVALID_PARAMETER; \ - if (fallback) free(fallback); \ - return 0; \ - } - -#define RETURN_ON_ERROR \ - if (errno) \ - { \ - if (fallback) free(fallback); \ - return 0; \ - } - -class DecoderFallbackBuffer; - -class DecoderFallback -{ -public: - - // Fallback - // - // Return the appropriate unicode string alternative to the character that need to fall back. - - virtual DecoderFallbackBuffer* CreateFallbackBuffer() { assert(!"pure virtual function called"); while(true); } - - // Maximum number of characters that this instance of this fallback could return - - virtual int GetMaxCharCount() { assert(!"pure virtual function called"); while(true); } -}; - -class DecoderReplacementFallback : public DecoderFallback -{ - // Our variables - char16_t strDefault[2]; - int strDefaultLength; - -public: - // Construction. Default replacement fallback uses no best fit and ? 
replacement string - DecoderReplacementFallback() : DecoderReplacementFallback(W("?")) - { - } - - DecoderReplacementFallback(const char16_t* replacement) - { - // Must not be null - ContractAssertVoid(replacement != nullptr) - - // Make sure it doesn't have bad surrogate pairs - bool bFoundHigh = false; - int replacementLength = wcslen((const char16_t *)replacement); - for (int i = 0; i < replacementLength; i++) - { - // Found a surrogate? - if (Char::IsSurrogate(replacement, i)) - { - // High or Low? - if (Char::IsHighSurrogate(replacement, i)) - { - // if already had a high one, stop - if (bFoundHigh) - break; // break & throw at the bFoundHIgh below - bFoundHigh = true; - } - else - { - // Low, did we have a high? - if (!bFoundHigh) - { - // Didn't have one, make if fail when we stop - bFoundHigh = true; - break; - } - - // Clear flag - bFoundHigh = false; - } - } - // If last was high we're in trouble (not surrogate so not low surrogate, so break) - else if (bFoundHigh) - break; - } - ContractAssertVoid(!bFoundHigh) - - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); - strDefaultLength = replacementLength; - } - - char16_t* GetDefaultString() - { - return strDefault; - } - - virtual DecoderFallbackBuffer* CreateFallbackBuffer(); - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return strDefaultLength; - } -}; - -class DecoderFallbackBuffer -{ - friend class UTF8Encoding; - // Most implementations will probably need an implementation-specific constructor - - // internal methods that cannot be overridden that let us do our fallback thing - // These wrap the internal methods so that we can check for people doing stuff that's incorrect - -public: - virtual bool Fallback(unsigned char bytesUnknown[], int index, int size) { assert(!"pure virtual function called"); while(true); } - - // Get next character - virtual char16_t GetNextChar() { assert(!"pure virtual function called"); 
while(true); } - - //Back up a character - virtual bool MovePrevious() { assert(!"pure virtual function called"); while(true); } - - // How many chars left in this fallback? - virtual int GetRemaining() { assert(!"pure virtual function called"); while(true); } - - // Clear the buffer - virtual void Reset() - { - while (GetNextChar() != (char16_t)0); - } - - // Internal items to help us figure out what we're doing as far as error messages, etc. - // These help us with our performance and messages internally -protected: - unsigned char* byteStart; - char16_t* charEnd; - - // Internal reset - void InternalReset() - { - byteStart = nullptr; - Reset(); - } - - // Set the above values - // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. - void InternalInitialize(unsigned char* byteStart, char16_t* charEnd) - { - this->byteStart = byteStart; - this->charEnd = charEnd; - } - - // Fallback the current byte by sticking it into the remaining char buffer. - // This can only be called by our encodings (other have to use the public fallback methods), so - // we can use our DecoderNLS here too (except we don't). - // Returns true if we are successful, false if we can't fallback the character (no buffer space) - // So caller needs to throw buffer space if return false. - // Right now this has both bytes and bytes[], since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - // Don't touch ref chars unless we succeed - virtual bool InternalFallback(unsigned char bytes[], unsigned char* pBytes, char16_t** chars, int size) - { - - ContractAssert(byteStart != nullptr) - - bool fallbackResult = this->Fallback(bytes, (int)(pBytes - byteStart - size), size); - if (errno) return false; - - // See if there's a fallback character and we have an output buffer then copy our string. 
- if (fallbackResult) - { - // Copy the chars to our output - char16_t ch; - char16_t* charTemp = *chars; - bool bHighSurrogate = false; - while ((ch = GetNextChar()) != 0) - { - // Make sure no mixed up surrogates - if (Char::IsSurrogate(ch)) - { - if (Char::IsHighSurrogate(ch)) - { - // High Surrogate - ContractAssert(!bHighSurrogate) - bHighSurrogate = true; - } - else - { - // Low surrogate - ContractAssert(bHighSurrogate) - bHighSurrogate = false; - } - } - - if (charTemp >= charEnd) - { - // No buffer space - return false; - } - - *(charTemp++) = ch; - } - - // Need to make sure that bHighSurrogate isn't true - ContractAssert(!bHighSurrogate) - - // Now we aren't going to be false, so its OK to update chars - *chars = charTemp; - } - - return true; - } - - // This version just counts the fallback and doesn't actually copy anything. - virtual int InternalFallback(unsigned char bytes[], unsigned char* pBytes, int size) - // Right now this has both bytes[] and unsigned char* bytes, since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - { - - ContractAssert(byteStart != nullptr) - - bool fallbackResult = this->Fallback(bytes, (int)(pBytes - byteStart - size), size); - if (errno) return 0; - - // See if there's a fallback character and we have an output buffer then copy our string. 
- if (fallbackResult) - { - int count = 0; - - char16_t ch; - bool bHighSurrogate = false; - while ((ch = GetNextChar()) != 0) - { - // Make sure no mixed up surrogates - if (Char::IsSurrogate(ch)) - { - if (Char::IsHighSurrogate(ch)) - { - // High Surrogate - ContractAssert(!bHighSurrogate) - bHighSurrogate = true; - } - else - { - // Low surrogate - ContractAssert(bHighSurrogate) - bHighSurrogate = false; - } - } - - count++; - } - - // Need to make sure that bHighSurrogate isn't true - ContractAssert(!bHighSurrogate) - - return count; - } - - // If no fallback return 0 - return 0; - } -}; - -class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer -{ - // Store our default string - char16_t strDefault[2]; - int strDefaultLength; - int fallbackCount = -1; - int fallbackIndex = -1; - -public: - // Construction - DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback) - { - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - strDefaultLength = wcslen((const char16_t *)fallback->GetDefaultString()); - } - - // Fallback Methods - virtual bool Fallback(unsigned char bytesUnknown[], int index, int size) - { - // We expect no previous fallback in our buffer - // We can't call recursively but others might (note, we don't test on last char!!!) - ContractAssert(fallbackCount < 1) - - // Go ahead and get our fallback - if (strDefaultLength == 0) - return false; - - fallbackCount = strDefaultLength; - fallbackIndex = -1; - - return true; - } - - virtual char16_t GetNextChar() - { - // We want it to get < 0 because == 0 means that the current/last character is a fallback - // and we need to detect recursion. We could have a flag but we already have this counter. - fallbackCount--; - fallbackIndex++; - - // Do we have anything left? 0 is now last fallback char, negative is nothing left - if (fallbackCount < 0) - return '\0'; - - // Need to get it out of the buffer. 
- // Make sure it didn't wrap from the fast count-- path - if (fallbackCount == INT_MAX) - { - fallbackCount = -1; - return '\0'; - } - - // Now make sure its in the expected range - ContractAssert(fallbackIndex < strDefaultLength && fallbackIndex >= 0) - - return strDefault[fallbackIndex]; - } - - virtual bool MovePrevious() - { - // Back up one, only if we just processed the last character (or earlier) - if (fallbackCount >= -1 && fallbackIndex >= 0) - { - fallbackIndex--; - fallbackCount++; - return true; - } - - // Return false 'cause we couldn't do it. - return false; - } - - // How many characters left to output? - virtual int GetRemaining() - { - // Our count is 0 for 1 character left. - return (fallbackCount < 0) ? 0 : fallbackCount; - } - - // Clear the buffer - virtual void Reset() - { - fallbackCount = -1; - fallbackIndex = -1; - byteStart = nullptr; - } - - // This version just counts the fallback and doesn't actually copy anything. - virtual int InternalFallback(unsigned char bytes[], unsigned char* pBytes, int size) - // Right now this has both bytes and bytes[], since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - { - // return our replacement string Length - return strDefaultLength; - } -}; - -class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer -{ -public: - DecoderExceptionFallbackBuffer() - { - } - - virtual bool Fallback(unsigned char bytesUnknown[], int index, int size) - { - ContractAssert(false) - } - - virtual char16_t GetNextChar() - { - return 0; - } - - virtual bool MovePrevious() - { - // Exception fallback doesn't have anywhere to back up to. 
- return false; - } - - // Exceptions are always empty - virtual int GetRemaining() - { - return 0; - } - -}; - -class DecoderExceptionFallback : public DecoderFallback -{ - // Construction -public: - DecoderExceptionFallback() - { - } - - virtual DecoderFallbackBuffer* CreateFallbackBuffer() - { - DecoderExceptionFallbackBuffer* pMem = (DecoderExceptionFallbackBuffer*)malloc(sizeof(DecoderExceptionFallbackBuffer)); - if (pMem == nullptr) - { - errno = ERROR_INSUFFICIENT_BUFFER; - return nullptr; - } - return new (pMem) DecoderExceptionFallbackBuffer(); - } - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return 0; - } -}; - -DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer() -{ - DecoderReplacementFallbackBuffer* pMem = (DecoderReplacementFallbackBuffer*)malloc(sizeof(DecoderReplacementFallbackBuffer)); - if (pMem == nullptr) - { - errno = ERROR_INSUFFICIENT_BUFFER; - return nullptr; - } - pMem = new (pMem) DecoderReplacementFallbackBuffer(this); - if (errno) - { - free(pMem); - return nullptr; - } - return pMem; -} - -class EncoderFallbackBuffer; - -class EncoderFallback -{ -public: - - // Fallback - // - // Return the appropriate unicode string alternative to the character that need to fall back. - - virtual EncoderFallbackBuffer* CreateFallbackBuffer() { assert(!"pure virtual function called"); while(true); } - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() { assert(!"pure virtual function called"); while(true); } -}; - -class EncoderReplacementFallback : public EncoderFallback -{ - // Our variables - char16_t strDefault[2]; - int strDefaultLength; - -public: - // Construction. Default replacement fallback uses no best fit and ? 
replacement string - EncoderReplacementFallback() : EncoderReplacementFallback(W("?")) - { - } - - EncoderReplacementFallback(const char16_t* replacement) - { - // Must not be null - ContractAssertVoid(replacement != nullptr) - - // Make sure it doesn't have bad surrogate pairs - bool bFoundHigh = false; - int replacementLength = wcslen((const char16_t *)replacement); - for (int i = 0; i < replacementLength; i++) - { - // Found a surrogate? - if (Char::IsSurrogate(replacement, i)) - { - // High or Low? - if (Char::IsHighSurrogate(replacement, i)) - { - // if already had a high one, stop - if (bFoundHigh) - break; // break & throw at the bFoundHIgh below - bFoundHigh = true; - } - else - { - // Low, did we have a high? - if (!bFoundHigh) - { - // Didn't have one, make if fail when we stop - bFoundHigh = true; - break; - } - - // Clear flag - bFoundHigh = false; - } - } - // If last was high we're in trouble (not surrogate so not low surrogate, so break) - else if (bFoundHigh) - break; - } - ContractAssertVoid(!bFoundHigh) - - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); - strDefaultLength = replacementLength; - } - - char16_t* GetDefaultString() - { - return strDefault; - } - - virtual EncoderFallbackBuffer* CreateFallbackBuffer(); - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return strDefaultLength; - } -}; - -class EncoderFallbackBuffer -{ - friend class UTF8Encoding; - // Most implementations will probably need an implementation-specific constructor - - // Public methods that cannot be overridden that let us do our fallback thing - // These wrap the internal methods so that we can check for people doing stuff that is incorrect - -public: - virtual bool Fallback(char16_t charUnknown, int index) { assert(!"pure virtual function called"); while(true); } - - virtual bool Fallback(char16_t charUnknownHigh, char16_t charUnknownLow, int index) { assert(!"pure virtual function 
called"); while(true); } - - // Get next character - virtual char16_t GetNextChar() { assert(!"pure virtual function called"); while(true); } - - // Back up a character - virtual bool MovePrevious() { assert(!"pure virtual function called"); while(true); } - - // How many chars left in this fallback? - virtual int GetRemaining() { assert(!"pure virtual function called"); while(true); } - - // Not sure if this should be public or not. - // Clear the buffer - virtual void Reset() - { - while (GetNextChar() != (char16_t)0); - } - - // Internal items to help us figure out what we're doing as far as error messages, etc. - // These help us with our performance and messages internally -protected: - char16_t* charStart; - char16_t* charEnd; - bool setEncoder; - bool bUsedEncoder; - bool bFallingBack = false; - int iRecursionCount = 0; - static const int iMaxRecursion = 250; - - // Internal Reset - // For example, what if someone fails a conversion and wants to reset one of our fallback buffers? - void InternalReset() - { - charStart = nullptr; - bFallingBack = false; - iRecursionCount = 0; - Reset(); - } - - // Set the above values - // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. - void InternalInitialize(char16_t* charStart, char16_t* charEnd, bool setEncoder) - { - this->charStart = charStart; - this->charEnd = charEnd; - this->setEncoder = setEncoder; - this->bUsedEncoder = false; - this->bFallingBack = false; - this->iRecursionCount = 0; - } - - char16_t InternalGetNextChar() - { - char16_t ch = GetNextChar(); - bFallingBack = (ch != 0); - if (ch == 0) iRecursionCount = 0; - return ch; - } - - // Fallback the current character using the remaining buffer and encoder if necessary - // This can only be called by our encodings (other have to use the public fallback methods), so - // we can use our EncoderNLS here too. 
- // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount - // - // Note that this could also change the contents of this->encoder, which is the same - // object that the caller is using, so the caller could mess up the encoder for us - // if they aren't careful. - virtual bool InternalFallback(char16_t ch, char16_t** chars) - { - // Shouldn't have null charStart - ContractAssert(charStart != nullptr) - - // Get our index, remember chars was preincremented to point at next char, so have to -1 - int index = (int)(*chars - charStart) - 1; - - // See if it was a high surrogate - if (Char::IsHighSurrogate(ch)) - { - // See if there's a low surrogate to go with it - if (*chars >= this->charEnd) - { - // Nothing left in input buffer - // No input, return 0 - } - else - { - // Might have a low surrogate - char16_t cNext = **chars; - if (Char::IsLowSurrogate(cNext)) - { - // If already falling back then fail - ContractAssert(!bFallingBack || iRecursionCount++ <= iMaxRecursion) - - // Next is a surrogate, add it as surrogate pair, and increment chars - (*chars)++; - bFallingBack = Fallback(ch, cNext, index); - return bFallingBack; - } - - // Next isn't a low surrogate, just fallback the high surrogate - } - } - - // If already falling back then fail - ContractAssert(!bFallingBack || iRecursionCount++ <= iMaxRecursion) - - // Fall back our char - bFallingBack = Fallback(ch, index); - - return bFallingBack; - } -}; - -class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer -{ - // Store our default string - char16_t strDefault[4]; - int strDefaultLength; - int fallbackCount = -1; - int fallbackIndex = -1; -public: - // Construction - EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback) - { - // 2X in case we're a surrogate pair - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - wcscat_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - 
strDefaultLength = 2 * wcslen((const char16_t *)fallback->GetDefaultString()); - - } - - // Fallback Methods - virtual bool Fallback(char16_t charUnknown, int index) - { - // If we had a buffer already we're being recursive, throw, it's probably at the suspect - // character in our array. - ContractAssert(fallbackCount < 1) - - // Go ahead and get our fallback - // Divide by 2 because we aren't a surrogate pair - fallbackCount = strDefaultLength / 2; - fallbackIndex = -1; - - return fallbackCount != 0; - } - - virtual bool Fallback(char16_t charUnknownHigh, char16_t charUnknownLow, int index) - { - // Double check input surrogate pair - ContractAssert(Char::IsHighSurrogate(charUnknownHigh)) - ContractAssert(Char::IsLowSurrogate(charUnknownLow)) - - // If we had a buffer already we're being recursive, throw, it's probably at the suspect - // character in our array. - ContractAssert(fallbackCount < 1) - - // Go ahead and get our fallback - fallbackCount = strDefaultLength; - fallbackIndex = -1; - - return fallbackCount != 0; - } - - virtual char16_t GetNextChar() - { - // We want it to get < 0 because == 0 means that the current/last character is a fallback - // and we need to detect recursion. We could have a flag but we already have this counter. - fallbackCount--; - fallbackIndex++; - - // Do we have anything left? 0 is now last fallback char, negative is nothing left - if (fallbackCount < 0) - return '\0'; - - // Need to get it out of the buffer. 
- // Make sure it didn't wrap from the fast count-- path - if (fallbackCount == INT_MAX) - { - fallbackCount = -1; - return '\0'; - } - - // Now make sure its in the expected range - ContractAssert(fallbackIndex < strDefaultLength && fallbackIndex >= 0) - - return strDefault[fallbackIndex]; - } - - virtual bool MovePrevious() - { - // Back up one, only if we just processed the last character (or earlier) - if (fallbackCount >= -1 && fallbackIndex >= 0) - { - fallbackIndex--; - fallbackCount++; - return true; - } - - // Return false 'cause we couldn't do it. - return false; - } - - // How many characters left to output? - virtual int GetRemaining() - { - // Our count is 0 for 1 character left. - return (fallbackCount < 0) ? 0 : fallbackCount; - } - - // Clear the buffer - virtual void Reset() - { - fallbackCount = -1; - fallbackIndex = 0; - charStart = nullptr; - bFallingBack = false; - } -}; - -class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer -{ -public: - EncoderExceptionFallbackBuffer() - { - } - - virtual bool Fallback(char16_t charUnknown, int index) - { - // Fall back our char - ContractAssert(false) - } - - virtual bool Fallback(char16_t charUnknownHigh, char16_t charUnknownLow, int index) - { - ContractAssert(Char::IsHighSurrogate(charUnknownHigh)) - ContractAssert(Char::IsLowSurrogate(charUnknownLow)) - - //int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow); - - // Fall back our char - ContractAssert(false) - } - - virtual char16_t GetNextChar() - { - return 0; - } - - virtual bool MovePrevious() - { - // Exception fallback doesn't have anywhere to back up to. 
- return false; - } - - // Exceptions are always empty - virtual int GetRemaining() - { - return 0; - } -}; - -class EncoderExceptionFallback : public EncoderFallback -{ - // Construction -public: - EncoderExceptionFallback() - { - } - - virtual EncoderFallbackBuffer* CreateFallbackBuffer() - { - EncoderExceptionFallbackBuffer* pMem = (EncoderExceptionFallbackBuffer*)malloc(sizeof(EncoderExceptionFallbackBuffer)); - if (pMem == nullptr) - return nullptr; - return new (pMem) EncoderExceptionFallbackBuffer(); - } - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return 0; - } -}; - -EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer() -{ - EncoderReplacementFallbackBuffer* pMem = (EncoderReplacementFallbackBuffer*)malloc(sizeof(EncoderReplacementFallbackBuffer)); - if (pMem == nullptr) - { - errno = ERROR_INSUFFICIENT_BUFFER; - return nullptr; - } - return new (pMem) EncoderReplacementFallbackBuffer(this); -} - -class UTF8Encoding -{ - EncoderFallback* encoderFallback; - // Instances of the two possible fallbacks. The constructor parameter - // determines which one to use. - EncoderReplacementFallback encoderReplacementFallback; - EncoderExceptionFallback encoderExceptionFallback; - - DecoderFallback* decoderFallback; - // Instances of the two possible fallbacks. The constructor parameter - // determines which one to use. 
- DecoderReplacementFallback decoderReplacementFallback; - DecoderExceptionFallback decoderExceptionFallback; - -#if BIGENDIAN - bool treatAsLE; -#endif - - bool InRange(int c, int begin, int end) - { - return begin <= c && c <= end; - } - - size_t PtrDiff(char16_t* ptr1, char16_t* ptr2) - { - return ptr1 - ptr2; - } - - size_t PtrDiff(unsigned char* ptr1, unsigned char* ptr2) - { - return ptr1 - ptr2; - } - - // During GetChars we had an invalid byte sequence - // pSrc is backed up to the start of the bad sequence if we didn't have room to - // fall it back. Otherwise pSrc remains where it is. - bool FallbackInvalidByteSequence(unsigned char** pSrc, int ch, DecoderFallbackBuffer* fallback, char16_t** pTarget) - { - // Get our byte[] - unsigned char* pStart = *pSrc; - unsigned char bytesUnknown[3]; - int size = GetBytesUnknown(pStart, ch, bytesUnknown); - bool fallbackResult = fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size); - RETURN_ON_ERROR - - // Do the actual fallback - if (!fallbackResult) - { - // Oops, it failed, back up to pStart - *pSrc = pStart; - return false; - } - - // It worked - return true; - } - - int FallbackInvalidByteSequence(unsigned char* pSrc, int ch, DecoderFallbackBuffer *fallback) - { - // Get our byte[] - unsigned char bytesUnknown[3]; - int size = GetBytesUnknown(pSrc, ch, bytesUnknown); - - // Do the actual fallback - int count = fallback->InternalFallback(bytesUnknown, pSrc, size); - - // # of fallback chars expected. 
- // Note that we only get here for "long" sequences, and have already unreserved - // the count that we prereserved for the input bytes - return count; - } - - int GetBytesUnknown(unsigned char* pSrc, int ch, unsigned char* bytesUnknown) - { - int size; - - // See if it was a plain char - // (have to check >= 0 because we have all sorts of weird bit flags) - if (ch < 0x100 && ch >= 0) - { - pSrc--; - bytesUnknown[0] = (unsigned char)ch; - size = 1; - } - // See if its an unfinished 2 byte sequence - else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0) - { - pSrc--; - bytesUnknown[0] = (unsigned char)((ch & 0x1F) | 0xc0); - size = 1; - } - // So now we're either 2nd byte of 3 or 4 byte sequence or - // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence - // 1st check if its a 4 byte sequence - else if ((ch & SupplimentarySeq) != 0) - { - // 3rd byte of 4 byte sequence? - if ((ch & (FinalByte >> 6)) != 0) - { - // 3rd byte of 4 byte sequence - pSrc -= 3; - bytesUnknown[0] = (unsigned char)(((ch >> 12) & 0x07) | 0xF0); - bytesUnknown[1] = (unsigned char)(((ch >> 6) & 0x3F) | 0x80); - bytesUnknown[2] = (unsigned char)(((ch)& 0x3F) | 0x80); - size = 3; - } - else if ((ch & (FinalByte >> 12)) != 0) - { - // 2nd byte of a 4 byte sequence - pSrc -= 2; - bytesUnknown[0] = (unsigned char)(((ch >> 6) & 0x07) | 0xF0); - bytesUnknown[1] = (unsigned char)(((ch)& 0x3F) | 0x80); - size = 2; - } - else - { - // 4th byte of a 4 byte sequence - pSrc--; - bytesUnknown[0] = (unsigned char)(((ch)& 0x07) | 0xF0); - size = 1; - } - } - else - { - // 2nd byte of 3 byte sequence? 
- if ((ch & (FinalByte >> 6)) != 0) - { - // So its 2nd byte of a 3 byte sequence - pSrc -= 2; - bytesUnknown[0] = (unsigned char)(((ch >> 6) & 0x0F) | 0xE0); - bytesUnknown[1] = (unsigned char)(((ch)& 0x3F) | 0x80); - size = 2; - } - else - { - // 1st byte of a 3 byte sequence - pSrc--; - bytesUnknown[0] = (unsigned char)(((ch)& 0x0F) | 0xE0); - size = 1; - } - } - - return size; - } - -public: - - UTF8Encoding(bool isThrowException, bool treatAsLE) - : encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD")) -#if BIGENDIAN - , treatAsLE(treatAsLE) -#endif - { - if (isThrowException) - { - encoderFallback = &encoderExceptionFallback; - decoderFallback = &decoderExceptionFallback; - } - else - { - encoderFallback = &encoderReplacementFallback; - decoderFallback = &decoderReplacementFallback; - } - } - - // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits - // while the actual character is being built in the lower bits. They are shifted together - // with the actual bits of the character. - - // bits 30 & 31 are used for pending bits fixup - const int FinalByte = 1 << 29; - const int SupplimentarySeq = 1 << 28; - const int ThreeByteSeq = 1 << 27; - - int GetCharCount(unsigned char* bytes, int count) - { - ContractAssert(bytes != nullptr) - ContractAssert(count >= 0) - - // Initialize stuff - unsigned char *pSrc = bytes; - unsigned char *pEnd = pSrc + count; - - // Start by assuming we have as many as count, charCount always includes the adjustment - // for the character being decoded - int charCount = count; - int ch = 0; - DecoderFallbackBuffer *fallback = nullptr; - - while (true) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) { - break; - } - - // read next byte. 
The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - - if (ch == 0) { - // no pending bits - goto ReadChar; - } - - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & 0xC0) != 0x80) { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - charCount += (ch >> 30); - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) { - ContractAssertFreeFallback((ch & (SupplimentarySeq | ThreeByteSeq)) != 0) - - if ((ch & SupplimentarySeq) != 0) { - if ((ch & (FinalByte >> 6)) != 0) { - // this is 3rd byte (of 4 byte supplimentary) - nothing to do - continue; - } - - // 2nd byte, check for non-shortest form of supplimentary char and the valid - // supplimentary characters in range 0x010000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) { - goto InvalidByteSequence; - } - } - else { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // adjust for surrogates in non-shortest form - if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) { - charCount--; - } - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - RETURN_ON_ERROR - fallback->InternalInitialize(bytes, nullptr); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); - - ch = 0; - continue; - - ReadChar: - ch = 
*pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) { - // If its > 0x7F, its start of a new multi-byte sequence - - // Long sequence, so unreserve our char. - charCount--; - - // bit 6 has to be non-zero for start of multibyte chars. - if ((ch & 0x40) == 0) { - // Unexpected trail byte - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) { - if ((ch & 0x10) != 0) { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) { - ch |= 0xf0; - goto InvalidByteSequence; - } - - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. - ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now - (1 << 30) | // If it dies on next byte we'll need an extra char - (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - - // Our character count will be 2 characters for these 4 bytes, so subtract another char - charCount--; - } - else { - // 3 byte encoding - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - - // We'll expect 1 character for these 3 bytes, so subtract another char. 
- charCount--; - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - ch |= 0xc0; - goto InvalidByteSequence; - } - - // Add bit flags so we'll be flagged correctly - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - -#ifdef FASTLOOP - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - if (availableBytes <= 13) { - // try to get over the remainder of the ascii characters fast though - unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - } - // we are done - ch = 0; - break; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - unsigned char *pStop = pSrc + availableBytes - 7; - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - - // get pSrc 2-byte aligned - if (((size_t)pSrc & 0x1) != 0) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) { - goto LongCode; - } - } - - // get pSrc 4-byte aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *(unsigned short*)pSrc; - if ((ch & 0x8080) != 0) { - goto LongCodeWithMask16; - } - pSrc += 2; - } - - - // Run 8 + 8 characters at a time! 
- while (pSrc < pStop) { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - pSrc += 8; - - // This is a really small loop - unroll it - if (pSrc >= pStop) - break; - - ch = *(int*)pSrc; - chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - pSrc += 8; - } - break; - - LongCodeWithMask32 : -#if BIGENDIAN - // be careful about the sign extension - if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 16); - else -#else - ch &= 0xFF; -#endif - - LongCodeWithMask16: -#if BIGENDIAN - if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 8); - else -#else - ch &= 0xFF; -#endif - - pSrc++; - if (ch <= 0x7F) { - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) { - - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & 0xC0) != 0x80) { - goto BadLongCode; - } - pSrc += 2; - - // extra byte - charCount--; - } - else { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. 
- (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - // extra byte - charCount--; - } - } - else { - // 2 byte encoding - - // check for non-shortest form - if ((ch & 0x1E) == 0) { - goto BadLongCode; - } - } - - // extra byte - charCount--; - } -#endif // FASTLOOP - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } - - // May have a problem if we have to flush - if (ch != 0) - { - // We were already adjusting for these, so need to unadjust - charCount += (ch >> 30); - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - RETURN_ON_ERROR - fallback->InternalInitialize(bytes, nullptr); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); - } - - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check m_throwOnOverflow for count) - ContractAssertFreeFallback(fallback == nullptr || fallback->GetRemaining() == 0) - - free(fallback); - - return charCount; - - } - - int GetChars(unsigned char* bytes, int byteCount, char16_t* chars, int charCount) - { - ContractAssert(chars != nullptr) - ContractAssert(byteCount >= 0) - ContractAssert(charCount >= 0) - ContractAssert(bytes != nullptr) - - unsigned char *pSrc = bytes; - char16_t *pTarget = chars; - - unsigned char *pEnd = pSrc + byteCount; - char16_t *pAllocatedBufferEnd = pTarget + charCount; - - int ch = 0; - - DecoderFallbackBuffer *fallback = nullptr; - - while (true) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) { - break; - } - - // read next byte. 
The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - - if (ch == 0) { - // no pending bits - goto ReadChar; - } - - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & 0xC0) != 0x80) { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) { - // Not at last byte yet - ContractAssertFreeFallback((ch & (SupplimentarySeq | ThreeByteSeq)) != 0) - - if ((ch & SupplimentarySeq) != 0) { - // Its a 4-byte supplimentary sequence - if ((ch & (FinalByte >> 6)) != 0) { - // this is 3rd byte of 4 byte sequence - nothing to do - continue; - } - - // 2nd byte of 4 bytes - // check for non-shortest form of surrogate and the valid surrogate - // range 0x000000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) { - goto InvalidByteSequence; - } - } - else { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // surrogate in shortest form? - // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? 
- if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) { - // let the range check for the second char throw the exception - if (pTarget < pAllocatedBufferEnd) { - *pTarget = (char16_t)(((ch >> 10) & 0x7FF) + - (short)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)))); - pTarget++; - - ch = (ch & 0x3FF) + - (int)(CharUnicodeInfo::LOW_SURROGATE_START); - } - } - - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - RETURN_ON_ERROR - fallback->InternalInitialize(bytes, pAllocatedBufferEnd); - } - - // That'll back us up the appropriate # of bytes if we didn't get anywhere - if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget)) - { - // Ran out of buffer space - // Need to throw an exception? - ContractAssertFreeFallback(pSrc >= bytes || pTarget == chars) - fallback->InternalReset(); - if (pTarget == chars) - { - errno = ERROR_INSUFFICIENT_BUFFER; - if (fallback) free(fallback); - return 0; - } - ch = 0; - break; - } - ContractAssert(pSrc >= bytes) - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) { - // If its > 0x7F, its start of a new multi-byte sequence - - // bit 6 has to be non-zero - if ((ch & 0x40) == 0) { - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) { - if ((ch & 0x10) != 0) { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) { - ch |= 0xf0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - } - else { - // 3 byte encoding - ch = (ch & 0x0F) | 
((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - ch |= 0xc0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - // write the pending character - if (pTarget >= pAllocatedBufferEnd) - { - // Fix chars so we make sure to throw if we didn't output anything - ch &= 0x1fffff; - if (ch > 0x7f) - { - if (ch > 0x7ff) - { - if (ch >= CharUnicodeInfo::LOW_SURROGATE_START && - ch <= CharUnicodeInfo::LOW_SURROGATE_END) - { - pSrc--; // It was 4 bytes - pTarget--; // 1 was stored already, but we can't remember 1/2, so back up - } - else if (ch > 0xffff) - { - pSrc--; // It was 4 bytes, nothing was stored - } - pSrc--; // It was at least 3 bytes - } - pSrc--; // It was at least 2 bytes - } - pSrc--; - - // Throw that we don't have enough room (pSrc could be < chars if we had started to process - // a 4 byte sequence already) - ContractAssert(pSrc >= bytes || pTarget == chars) - if (pTarget == chars) - { - errno = ERROR_INSUFFICIENT_BUFFER; - if (fallback) free(fallback); - return 0; - } - - // Don't store ch in decoder, we already backed up to its start - ch = 0; - - // Didn't throw, just use this buffer size. - break; - } - *pTarget = (char16_t)ch; - pTarget++; - -#ifdef FASTLOOP - int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget); - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - // Test for availableChars is done because pStop would be <= pTarget. - if (availableBytes <= 13) { - // we may need as many as 1 character per byte - if (availableChars < availableBytes) { - // not enough output room. 
no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (char16_t)ch; - pTarget++; - } - // we are done - ch = 0; - break; - } - - // we may need as many as 1 character per byte, so reduce the byte count if necessary. - // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. - if (availableChars < availableBytes) { - availableBytes = availableChars; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - char16_t *pStop = pTarget + availableBytes - 7; - - while (pTarget < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (char16_t)ch; - pTarget++; - - // get pSrc to be 2-byte aligned - if ((((size_t)pSrc) & 0x1) != 0) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (char16_t)ch; - pTarget++; - } - - // get pSrc to be 4-byte aligned - if ((((size_t)pSrc) & 0x2) != 0) { - ch = *(unsigned short*)pSrc; - if ((ch & 0x8080) != 0) { - goto LongCodeWithMask16; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - if (!treatAsLE) - { - *pTarget = (char16_t)((ch >> 8) & 0x7F); - pSrc += 2; - *(pTarget + 1) = (char16_t)(ch & 0x7F); - pTarget += 2; - } - else -#else - { - *pTarget = (char16_t)(ch & 0x7F); - pSrc += 2; - *(pTarget + 1) = (char16_t)((ch >> 8) & 0x7F); - pTarget += 2; - } -#endif - } - - // Run 8 characters at a time! 
- while (pTarget < pStop) { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - if (!treatAsLE) - { - *pTarget = (char16_t)((ch >> 24) & 0x7F); - *(pTarget + 1) = (char16_t)((ch >> 16) & 0x7F); - *(pTarget + 2) = (char16_t)((ch >> 8) & 0x7F); - *(pTarget + 3) = (char16_t)(ch & 0x7F); - pSrc += 8; - *(pTarget + 4) = (char16_t)((chb >> 24) & 0x7F); - *(pTarget + 5) = (char16_t)((chb >> 16) & 0x7F); - *(pTarget + 6) = (char16_t)((chb >> 8) & 0x7F); - *(pTarget + 7) = (char16_t)(chb & 0x7F); - pTarget += 8; - } - else -#else - { - *pTarget = (char16_t)(ch & 0x7F); - *(pTarget + 1) = (char16_t)((ch >> 8) & 0x7F); - *(pTarget + 2) = (char16_t)((ch >> 16) & 0x7F); - *(pTarget + 3) = (char16_t)((ch >> 24) & 0x7F); - pSrc += 8; - *(pTarget + 4) = (char16_t)(chb & 0x7F); - *(pTarget + 5) = (char16_t)((chb >> 8) & 0x7F); - *(pTarget + 6) = (char16_t)((chb >> 16) & 0x7F); - *(pTarget + 7) = (char16_t)((chb >> 24) & 0x7F); - pTarget += 8; - } -#endif - } - break; - - LongCodeWithMask32 : -#if BIGENDIAN - // be careful about the sign extension - if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 16); - else -#else - ch &= 0xFF; -#endif - - LongCodeWithMask16: -#if BIGENDIAN - if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 8); - else -#else - ch &= 0xFF; -#endif - - pSrc++; - if (ch <= 0x7F) { - *pTarget = (char16_t)ch; - pTarget++; - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) { - - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - 
// and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & 0xC0) != 0x80) { - goto BadLongCode; - } - pSrc += 2; - - ch = (chc << 6) | (ch & 0x3F); - - *pTarget = (char16_t)(((ch >> 10) & 0x7FF) + - (short)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))); - pTarget++; - - ch = (ch & 0x3FF) + - (short)(CharUnicodeInfo::LOW_SURROGATE_START); - - // extra byte, we're already planning 2 chars for 2 of these bytes, - // but the big loop is testing the target against pStop, so we need - // to subtract 2 more or we risk overrunning the input. Subtract - // one here and one below. - pStop--; - } - else { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - ch = (chc << 6) | (ch & 0x3F); - - // extra byte, we're only expecting 1 char for each of these 3 bytes, - // but the loop is testing the target (not source) against pStop, so - // we need to subtract 2 more or we risk overrunning the input. - // Subtract 1 here and one more below - pStop--; - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - goto BadLongCode; - } - ch = (ch << 6) | chc; - } - - *pTarget = (char16_t)ch; - pTarget++; - - // extra byte, we're only expecting 1 char for each of these 2 bytes, - // but the loop is testing the target (not source) against pStop. - // subtract an extra count from pStop so that we don't overrun the input. 
- pStop--; - } -#endif // FASTLOOP - - ContractAssert(pTarget <= pAllocatedBufferEnd) - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } - - if (ch != 0) - { - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - RETURN_ON_ERROR - fallback->InternalInitialize(bytes, pAllocatedBufferEnd); - } - - // This'll back us up the appropriate # of bytes if we didn't get anywhere - if (!FallbackInvalidByteSequence(pSrc, ch, fallback)) - { - ContractAssertFreeFallback(pSrc >= bytes || pTarget == chars) - - // Ran out of buffer space - // Need to throw an exception? - fallback->InternalReset(); - if (pTarget == chars) - { - errno = ERROR_INSUFFICIENT_BUFFER; - if (fallback) free(fallback); - return 0; - } - } - ContractAssertFreeFallback(pSrc >= bytes) - ch = 0; - } - - // Shouldn't have anything in fallback buffer for GetChars - // (don't have to check m_throwOnOverflow for chars) - ContractAssert(fallback == nullptr || fallback->GetRemaining() == 0) - - free(fallback); - - return PtrDiff(pTarget, chars); - } - - int GetBytes(char16_t* chars, int charCount, unsigned char* bytes, int byteCount) - { - ContractAssert(chars != nullptr) - ContractAssert(byteCount >= 0) - ContractAssert(charCount >= 0) - ContractAssert(bytes != nullptr) - - // For fallback we may need a fallback buffer. 
- // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer* fallback = nullptr; - char16_t *pSrc = chars; - unsigned char *pTarget = bytes; - - char16_t *pEnd = pSrc + charCount; - unsigned char *pAllocatedBufferEnd = pTarget + byteCount; - - int ch = 0; - - // assume that JIT will enregister pSrc, pTarget and ch - - while (true) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) { - - if (ch == 0) { - // Check if there's anything left to get out of the fallback buffer - ch = fallback != nullptr ? fallback->InternalGetNextChar() : 0; - if (ch > 0) { - goto ProcessChar; - } - } - else { - // Case of leftover surrogates in the fallback buffer - if (fallback != nullptr && fallback->bFallingBack) { - ContractAssertFreeFallback(ch >= 0xD800 && ch <= 0xDBFF); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)) - - int cha = ch; - - ch = fallback->InternalGetNextChar(); - - if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - goto EncodeChar; - } - else if (ch > 0){ - goto ProcessChar; - } - else { - break; - } - } - } - - // attempt to encode the partial surrogate (will fail or ignore) - if (ch > 0) - goto EncodeChar; - - // We're done - break; - } - - if (ch > 0) { - // We have a high surrogate left over from a previous loop. - ContractAssertFreeFallback(ch >= 0xD800 && ch <= 0xDBFF);//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)) - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. 
- // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = cha + (ch << 10) + - (0x10000 - - CharUnicodeInfo::LOW_SURROGATE_START - - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - - pSrc++; - } - // else ch is still high surrogate and encoding will fail - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallback != nullptr) - { - ch = fallback->InternalGetNextChar(); - if (ch > 0) goto ProcessChar; - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed, we have to do fallback for them - // Have to make a fallback buffer if we don't have one - if (fallback == nullptr) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - fallback = encoderFallback->CreateFallbackBuffer(); - RETURN_ON_ERROR - - // Set our internal fallback interesting things. - fallback->InternalInitialize(chars, pEnd, true); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. 
- fallback->InternalFallback((char16_t)ch, &pSrc); - RETURN_ON_ERROR - - // Ignore it if we don't throw - ch = 0; - continue; - } - - // Count bytes needed - int bytesNeeded = 1; - if (ch > 0x7F) { - if (ch > 0x7FF) { - if (ch > 0xFFFF) { - bytesNeeded++; // 4 bytes (surrogate pair) - } - bytesNeeded++; // 3 bytes (800-FFFF) - } - bytesNeeded++; // 2 bytes (80-7FF) - } - - if (pTarget > pAllocatedBufferEnd - bytesNeeded) { - // Left over surrogate from last time will cause pSrc == chars, so we'll throw - if (fallback != nullptr && fallback->bFallingBack) - { - fallback->MovePrevious(); // Didn't use this fallback char - if (ch > 0xFFFF) - fallback->MovePrevious(); // Was surrogate, didn't use 2nd part either - } - else - { - pSrc--; // Didn't use this char - if (ch > 0xFFFF) - pSrc--; // Was surrogate, didn't use 2nd part either - } - ContractAssertFreeFallback(pSrc >= chars || pTarget == bytes) - if (pTarget == bytes) // Throw if we must - { - errno = ERROR_INSUFFICIENT_BUFFER; - if (fallback) free(fallback); - return 0; - } - ch = 0; // Nothing left over (we backed up to start of pair if supplimentary) - break; - } - - if (ch <= 0x7F) { - *pTarget = (unsigned char)ch; - } - else { - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int chb; - if (ch <= 0x7FF) { - // 2 unsigned char encoding - chb = (unsigned char)(0xC0 | (ch >> 6)); - } - else - { - if (ch <= 0xFFFF) { - chb = (unsigned char)(0xE0 | (ch >> 12)); - } - else - { - *pTarget = (unsigned char)(0xF0 | (ch >> 18)); - pTarget++; - - chb = 0x80 | ((ch >> 12) & 0x3F); - } - *pTarget = (unsigned char)chb; - pTarget++; - - chb = 0x80 | ((ch >> 6) & 0x3F); - } - *pTarget = (unsigned char)chb; - pTarget++; - - *pTarget = (unsigned char)0x80 | (ch & 0x3F); - } - pTarget++; - - -#ifdef FASTLOOP - // If still have fallback don't do fast loop - if (fallback != nullptr && (ch = fallback->InternalGetNextChar()) != 0) - goto 
ProcessChar; - - int availableChars = PtrDiff(pEnd, pSrc); - int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget); - - // don't fall into the fast decoding loop if we don't have enough characters - // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. - if (availableChars <= 13) { - // we are hoping for 1 unsigned char per char - if (availableBytes < availableChars) { - // not enough output room. no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - char16_t* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - // Not ASCII, need more than 1 unsigned char per char - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (unsigned char)ch; - pTarget++; - } - // we are done, let ch be 0 to clear encoder - ch = 0; - break; - } - - // we need at least 1 unsigned char per character, but Convert might allow us to convert - // only part of the input, so try as much as we can. Reduce charCount if necessary - if (availableBytes < availableChars) - { - availableChars = availableBytes; - } - - // FASTLOOP: - // - optimistic range checks - // - fallbacks to the slow loop for all special cases, exception throwing, etc. - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. 
- char16_t *pStop = pSrc + availableChars - 5; - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (unsigned char)ch; - pTarget++; - - // get pSrc aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (unsigned char)ch; - pTarget++; - } - - // Run 4 characters at a time! - while (pSrc < pStop) { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) { - goto LongCodeWithMask; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - if (!treatAsLE) - { - *pTarget = (unsigned char)(ch >> 16); - *(pTarget + 1) = (unsigned char)ch; - pSrc += 4; - *(pTarget + 2) = (unsigned char)(chc >> 16); - *(pTarget + 3) = (unsigned char)chc; - pTarget += 4; - } - else -#else - { - *pTarget = (unsigned char)ch; - *(pTarget + 1) = (unsigned char)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (unsigned char)chc; - *(pTarget + 3) = (unsigned char)(chc >> 16); - pTarget += 4; - } -#endif - } - continue; - - LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 16); - else -#else - ch = (char16_t)ch; -#endif - - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (unsigned char)ch; - pTarget++; - continue; - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - int chd; - if (ch <= 0x7FF) { - // 2 unsigned char encoding - chd = 0xC0 | (ch >> 6); - } - else { - if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // 3 unsigned char encoding - chd = 0xE0 | (ch >> 12); - } - else - { - // 4 unsigned char encoding - high surrogate + low surrogate - if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) { - // low without high -> bad, try again in slow loop - pSrc -= 1; - break; - } - - chd = *pSrc; - pSrc++; - 
- // if (!IsLowSurrogate(chd)) { - if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // high not followed by low -> bad, try again in slow loop - pSrc -= 2; - break; - } - - ch = chd + (ch << 10) + - (0x10000 - - CharUnicodeInfo::LOW_SURROGATE_START - - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - - *pTarget = (unsigned char)(0xF0 | (ch >> 18)); - // pStop - this unsigned char is compensated by the second surrogate character - // 2 input chars require 4 output bytes. 2 have been anticipated already - // and 2 more will be accounted for by the 2 pStop-- calls below. - pTarget++; - - chd = 0x80 | ((ch >> 12) & 0x3F); - } - *pTarget = (unsigned char)chd; - pStop--; // 3 unsigned char sequence for 1 char, so need pStop-- and the one below too. - pTarget++; - - chd = 0x80 | ((ch >> 6) & 0x3F); - } - *pTarget = (unsigned char)chd; - pStop--; // 2 unsigned char sequence for 1 char so need pStop--. - pTarget++; - - *pTarget = (unsigned char)(0x80 | (ch & 0x3F)); - // pStop - this unsigned char is already included - pTarget++; - } - - ContractAssertFreeFallback(pTarget <= pAllocatedBufferEnd) - -#endif // FASTLOOP - - // no pending char at this point - ch = 0; - } - - free(fallback); - - return (int)(pTarget - bytes); - } - - int GetByteCount(char16_t *chars, int count) - { - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer* fallback = nullptr; - char16_t *pSrc = chars; - char16_t *pEnd = pSrc + count; - - // Start by assuming we have as many as count - int byteCount = count; - - int ch = 0; - - while (true) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) { - - if (ch == 0) { - // Unroll any fallback that happens at the end - ch = fallback != nullptr ? 
fallback->InternalGetNextChar() : 0; - if (ch > 0) { - byteCount++; - goto ProcessChar; - } - } - else { - // Case of surrogates in the fallback. - if (fallback != nullptr && fallback->bFallingBack) { - ContractAssertFreeFallback(ch >= 0xD800 && ch <= 0xDBFF);// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)) - - ch = fallback->InternalGetNextChar(); - byteCount++; - - if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = 0xfffd; - byteCount++; - goto EncodeChar; - } - else if (ch > 0){ - goto ProcessChar; - } - else { - byteCount--; // ignore last one. - break; - } - } - } - - if (ch <= 0) { - break; - } - - // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1. - byteCount++; - goto EncodeChar; - } - - if (ch > 0) { - ContractAssertFreeFallback(ch >= 0xD800 && ch <= 0xDBFF); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)) - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // count the pending surrogate - byteCount++; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do. 
- ch = 0xfffd; - // ch = cha + (ch << 10) + - // (0x10000 - // - CharUnicodeInfo::LOW_SURROGATE_START - // - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) ); - - // Use this next char - pSrc++; - } - // else ch is still high surrogate and encoding will fail (so don't add count) - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallback != nullptr) - { - ch = fallback->InternalGetNextChar(); - if (ch > 0) - { - // We have an extra byte we weren't expecting. - byteCount++; - goto ProcessChar; - } - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { - // we will count this surrogate next time around - byteCount--; - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed - // Have to make a fallback buffer if we don't have one - if (fallback == nullptr) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - fallback = encoderFallback->CreateFallbackBuffer(); - RETURN_ON_ERROR - - // Set our internal fallback interesting things. - fallback->InternalInitialize(chars, chars + count, false); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. 
- fallback->InternalFallback((char16_t)ch, &pSrc); - RETURN_ON_ERROR - - // Ignore it if we don't throw (we had preallocated this ch) - byteCount--; - ch = 0; - continue; - } - - // Count them - if (ch > 0x7F) { - if (ch > 0x7FF) { - // the extra surrogate byte was compensated by the second surrogate character - // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char) - byteCount++; - } - byteCount++; - } - -#if WIN64 - // check for overflow - if (byteCount < 0) { - break; - } -#endif - -#ifdef FASTLOOP - // If still have fallback don't do fast loop - if (fallback != nullptr && (ch = fallback->InternalGetNextChar()) != 0) - { - // We're reserving 1 byte for each char by default - byteCount++; - goto ProcessChar; - } - - int availableChars = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough characters - if (availableChars <= 13) { - // try to get over the remainder of the ascii characters fast though - char16_t* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - goto ProcessChar; - } - - // we are done - break; - } - -#if WIN64 - // make sure that we won't get a silent overflow inside the fast loop - // (Fall out to slow loop if we have this many characters) - availableChars &= 0x0FFFFFFF; -#endif - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - char16_t *pStop = pSrc + availableChars - (3 + 4); - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - - // get pSrc aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *pSrc; - 
pSrc++; - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - } - - // Run 2 * 4 characters at a time! - while (pSrc < pStop) { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII - { - if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - - if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits) - byteCount++; - if ((ch & (int)0xFF80) != 0) - byteCount++; - if ((chc & (int)0xFF800000) != 0) - byteCount++; - if ((chc & (int)0xFF80) != 0) - byteCount++; - } - pSrc += 4; - - ch = *(int*)pSrc; - chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII - { - if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - if ((ch & (int)0xFF800000) != 0) - byteCount++; - if ((ch & (int)0xFF80) != 0) - byteCount++; - if ((chc & (int)0xFF800000) != 0) - byteCount++; - if ((chc & (int)0xFF80) != 0) - byteCount++; - } - pSrc += 4; - } - break; - - LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - if (!treatAsLE) ch = (int)(((unsigned int)ch) >> 16); - else -#else - ch = (char16_t)ch; -#endif - - pSrc++; - - if (ch <= 0x7F) { - continue; - } - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - if (ch > 0x7FF) { - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // 4 byte encoding - high surrogate + low surrogate - - int chd = *pSrc; - if ( - ch > CharUnicodeInfo::HIGH_SURROGATE_END || - !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Back up and drop out to slow loop to figure out error - pSrc--; - break; - } - pSrc++; - - // 
byteCount - this byte is compensated by the second surrogate character - } - byteCount++; - } - byteCount++; - - // byteCount - the last byte is already included - } -#endif // FASTLOOP - - // no pending char at this point - ch = 0; - } - -#if WIN64 - // check for overflow - ContractAssertFreeFallback(byteCount >= 0) -#endif - ContractAssertFreeFallback(fallback == nullptr || fallback->GetRemaining() == 0) - - free(fallback); - - return byteCount; - } -}; - -int minipal_utf8_to_utf16_preallocated( - const char* lpSrcStr, - int cchSrc, - char16_t** lpDestStr, - int cchDest, - unsigned int dwFlags, - bool treatAsLE) -{ - int ret; - errno = 0; - - if (cchSrc < 0) - cchSrc = strlen(lpSrcStr) + 1; - - UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS, treatAsLE); - ret = enc.GetCharCount((unsigned char*)lpSrcStr, cchSrc); - if (cchDest) - { - if (ret > cchDest) - { - errno = ERROR_INSUFFICIENT_BUFFER; - ret = 0; - } - enc.GetChars((unsigned char*)lpSrcStr, cchSrc, (char16_t*)*lpDestStr, ret); - if (errno) ret = 0; - } - return ret; -} - -static int utf16_to_utf8_preallocated( - const char16_t* lpSrcStr, - int cchSrc, - char** lpDestStr, - int cchDest, - bool treatAsLE) -{ - int ret; - errno = 0; - - if (cchSrc < 0) - cchSrc = wcslen(lpSrcStr) + 1; - - UTF8Encoding enc(false, treatAsLE); - ret = enc.GetByteCount((char16_t*)lpSrcStr, cchSrc); - if (cchDest) - { - if (ret > cchDest) - { - errno = ERROR_INSUFFICIENT_BUFFER; - ret = 0; - } - enc.GetBytes((char16_t*)lpSrcStr, cchSrc, (unsigned char*)*lpDestStr, ret); - if (errno) ret = 0; - } - return ret; -} - -int minipal_utf16_to_utf8_preallocated( - const char16_t* lpSrcStr, - int cchSrc, - char** lpDestStr, - int cchDest) -{ - return utf16_to_utf8_preallocated(lpSrcStr, cchSrc, lpDestStr, cchDest, false); -} - -int minipal_utf8_to_utf16_allocate( - const char* lpSrcStr, - int cchSrc, - char16_t** lpDestStr, - unsigned int dwFlags, - bool treatAsLE) -{ - int cchDest = minipal_utf8_to_utf16_preallocated(lpSrcStr, cchSrc, 
nullptr, 0, dwFlags, !treatAsLE); - if (cchDest > 0) - { - *lpDestStr = (char16_t*)malloc((cchDest + 1) * sizeof(char16_t)); - cchDest = minipal_utf8_to_utf16_preallocated(lpSrcStr, cchSrc, lpDestStr, cchDest, dwFlags, !treatAsLE); - (*lpDestStr)[cchDest] = '\0'; - } - return cchDest; -} - -int minipal_utf16_to_utf8_allocate( - const char16_t* lpSrcStr, - int cchSrc, - char** lpDestStr, - bool treatAsLE) -{ - int cchDest = utf16_to_utf8_preallocated(lpSrcStr, cchSrc, nullptr, 0, treatAsLE); - if (cchDest > 0) - { - *lpDestStr = (char*)malloc((cchDest + 1) * sizeof(char)); - cchDest = utf16_to_utf8_preallocated(lpSrcStr, cchSrc, lpDestStr, cchDest, treatAsLE); - (*lpDestStr)[cchDest] = '\0'; - } - return cchDest; -} diff --git a/src/native/minipal/utf8.h b/src/native/minipal/utf8.h index 71b9a805aa11b5..29c5ba8ab308c2 100644 --- a/src/native/minipal/utf8.h +++ b/src/native/minipal/utf8.h @@ -8,23 +8,64 @@ #include #include -#define MB_ERR_INVALID_CHARS 0x00000008 -#define ERROR_NO_UNICODE_TRANSLATION 1113L -#define ERROR_INSUFFICIENT_BUFFER 122L -#define ERROR_INVALID_PARAMETER 87L +#define MINIPAL_MB_NO_REPLACE_INVALID_CHARS 0x00000008 +#define MINIPAL_TREAT_AS_LITTLE_ENDIAN 0x00000016 +#define MINIPAL_ERROR_INSUFFICIENT_BUFFER 122L #ifdef __cplusplus extern "C" { #endif // __cplusplus -int minipal_utf8_to_utf16_preallocated(const char* lpSrcStr, int cchSrc, char16_t** lpDestStr, int cchDest, unsigned int dwFlags, bool treatAsLE); +#ifdef TARGET_WINDOWS +typedef wchar_t CHAR16_T; +#else +typedef unsigned short CHAR16_T; +#endif -int minipal_utf16_to_utf8_preallocated(const char16_t* lpSrcStr, int cchSrc, char** lpDestStr, int cchDest); +/** + * Get length of destination needed for UTF-8 to UTF-16 (UCS-2) conversion + * + * @param source The source string in UTF-8 format. + * @param sourceLength Length of the source string. + * @param flags Flags to alter the behavior of converter. 
Supported flags are MINIPAL_MB_NO_REPLACE_INVALID_CHARS and MINIPAL_TREAT_AS_LITTLE_ENDIAN. + * @return Length of UTF-16 buffer required by the conversion. + */ +size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags); -int minipal_utf8_to_utf16_allocate(const char* lpSrcStr, int cchSrc, char16_t** lpDestStr, unsigned int dwFlags, bool treatAsLE); +/** + * Get length of destination needed for UTF-16 (UCS-2) to UTF-8 conversion + * + * @param source The source string in UTF-16 format. + * @param sourceLength Length of the source string. + * @param flags Flags to alter the behavior of converter. Supported flags are MINIPAL_MB_NO_REPLACE_INVALID_CHARS and MINIPAL_TREAT_AS_LITTLE_ENDIAN. + * @return Length of UTF-8 buffer required by the conversion. + */ +size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags); -int minipal_utf16_to_utf8_allocate(const char16_t* lpSrcStr, int cchSrc, char** lpDestStr, bool treatAsLE); +/** + * Convert a string from UTF-8 to UTF-16 (UCS-2) with preallocated memory + * + * @param source The source string in UTF-8 format. + * @param sourceLength Length of the source string. + * @param destination Pointer to the destination UTF-16 string. It can be NULL to query number of items required by the conversion. + * @param destinationLength Length of the destination string. + * @param flags Flags to alter the behavior of converter. Supported flags are MINIPAL_MB_NO_REPLACE_INVALID_CHARS and MINIPAL_TREAT_AS_LITTLE_ENDIAN. + * @return Number of items written by the conversion. + */ +size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags); + +/** + * Convert a string from UTF-16 (UCS-2) to UTF-8 with preallocated memory + * + * @param source The source string in UTF-16 format. + * @param sourceLength Length of the source string. 
+ * @param destination Pointer to the destination UTF-8 string. It can be NULL to query number of items required by the conversion. + * @param destinationLength Length of the destination string. + * @param flags Flags to alter the behavior of converter. Supported flags are MINIPAL_MB_NO_REPLACE_INVALID_CHARS and MINIPAL_TREAT_AS_LITTLE_ENDIAN. + * @return Number of items written by the conversion. + */ +size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, unsigned int flags); #ifdef __cplusplus } From 50b05498a44c484406fa8114429edd84b49de260 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Sat, 17 Jun 2023 13:08:36 +0300 Subject: [PATCH 6/9] Delete unused macros --- src/coreclr/inc/utilcode.h | 179 -------------------------------- src/coreclr/vm/rtlfunctions.cpp | 2 +- 2 files changed, 1 insertion(+), 180 deletions(-) diff --git a/src/coreclr/inc/utilcode.h b/src/coreclr/inc/utilcode.h index a332a6ccd66927..bc84e71644c9d8 100644 --- a/src/coreclr/inc/utilcode.h +++ b/src/coreclr/inc/utilcode.h @@ -185,15 +185,6 @@ typedef LPSTR LPUTF8; // given and ANSI String, copy it into a wide buffer. // be careful about scoping when using this macro! // -// how to use the below two macros: -// -// ... -// LPSTR pszA; -// pszA = MyGetAnsiStringRoutine(); -// MAKE_WIDEPTR_FROMANSI(pwsz, pszA); -// MyUseWideStringRoutine(pwsz); -// ... -// // similarily for MAKE_ANSIPTR_FROMWIDE. note that the first param does not // have to be declared, and no clean up must be done. // @@ -211,25 +202,6 @@ typedef LPSTR LPUTF8; #define MAKE_TRANSLATIONFAILED ThrowWin32(ERROR_NO_UNICODE_TRANSLATION) #endif -// This version throws on conversion errors (ie, no best fit character -// mapping to characters that look similar, and no use of the default char -// ('?') when printing out unrepresentable characters. 
Use this method for -// most development in the EE, especially anything like metadata or class -// names. See the BESTFIT version if you're printing out info to the console. -#define MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, codepage) \ - int __l##ptrname = (int)u16_strlen(widestr); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - __l##ptrname = (int)((__l##ptrname + 1) * 2 * sizeof(char)); \ - CQuickBytes __CQuickBytes##ptrname; \ - __CQuickBytes##ptrname.AllocThrows(__l##ptrname); \ - BOOL __b##ptrname; \ - DWORD __cBytes##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, -1, (LPSTR)__CQuickBytes##ptrname.Ptr(), __l##ptrname, NULL, &__b##ptrname); \ - if (__b##ptrname || (__cBytes##ptrname == 0 && (widestr[0] != W('\0')))) { \ - MAKE_TRANSLATIONFAILED; \ - } \ - LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr() - // This version does best fit character mapping and also allows the use // of the default char ('?') for any Unicode character that isn't // representable. This is reasonable for writing to the console, but @@ -247,40 +219,6 @@ typedef LPSTR LPUTF8; } \ LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr() -// Use for anything critical other than output to console, where weird -// character mappings are unacceptable. -#define MAKE_ANSIPTR_FROMWIDE(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, CP_ACP) - -// Use for output to the console. 
-#define MAKE_ANSIPTR_FROMWIDE_BESTFIT(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE_BESTFIT(ptrname, widestr, CP_ACP) - -#define MAKE_WIDEPTR_FROMANSI(ptrname, ansistr) \ - CQuickBytes __qb##ptrname; \ - int __l##ptrname; \ - __l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - LPWSTR ptrname = (LPWSTR) __qb##ptrname.AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \ - if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) == 0) { \ - MAKE_TRANSLATIONFAILED; \ - } - -#define MAKE_WIDEPTR_FROMANSI_NOTHROW(ptrname, ansistr) \ - CQuickBytes __qb##ptrname; \ - LPWSTR ptrname = 0; \ - int __l##ptrname; \ - __l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \ - if (__l##ptrname <= MAKE_MAX_LENGTH) { \ - ptrname = (LPWSTR) __qb##ptrname.AllocNoThrow((__l##ptrname+1)*sizeof(WCHAR)); \ - if (ptrname) { \ - if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) != 0) { \ - ptrname[__l##ptrname] = 0; \ - } else { \ - ptrname = 0; \ - } \ - } \ - } - #define MAKE_UTF8PTR_FROMWIDE(ptrname, widestr) CQuickBytes _##ptrname; _##ptrname.ConvertUnicode_Utf8(widestr); LPSTR ptrname = (LPSTR) _##ptrname.Ptr(); #define MAKE_UTF8PTR_FROMWIDE_NOTHROW(ptrname, widestr) \ @@ -312,22 +250,8 @@ typedef LPSTR LPUTF8; } \ } \ -#define MAKE_WIDEPTR_FROMUTF8N(ptrname, utf8str, n8chrs) \ - CQuickBytes __qb##ptrname; \ - int __l##ptrname; \ - __l##ptrname = WszMultiByteToWideChar(CP_UTF8, 0, utf8str, n8chrs, 0, 0); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - LPWSTR ptrname = (LPWSTR) __qb##ptrname .AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \ - if (0==WszMultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8str, n8chrs, ptrname, __l##ptrname)) { \ - MAKE_TRANSLATIONFAILED; \ - } \ - ptrname[__l##ptrname] = 0; - - #define MAKE_WIDEPTR_FROMUTF8(ptrname, utf8str) CQuickBytes _##ptrname; 
_##ptrname.ConvertUtf8_Unicode(utf8str); LPCWSTR ptrname = (LPCWSTR) _##ptrname.Ptr(); - #define MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, n8chrs) \ CQuickBytes __qb##ptrname; \ int __l##ptrname; \ @@ -346,42 +270,10 @@ typedef LPSTR LPUTF8; #define MAKE_WIDEPTR_FROMUTF8_NOTHROW(ptrname, utf8str) MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, -1) -// This method takes the number of characters -#define MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, codepage) \ - CQuickBytes __qb##ptrname; \ - int __l##ptrname; \ - __l##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, NULL, 0, NULL, NULL); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \ - BOOL __b##ptrname; \ - DWORD _pCnt = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, ptrname, __l##ptrname, NULL, &__b##ptrname); \ - if (__b##ptrname || (_pCnt == 0 && _nCharacters > 0)) { \ - MAKE_TRANSLATIONFAILED; \ - } \ - ptrname[__l##ptrname] = 0; - -#define MAKE_MULTIBYTE_FROMWIDEN_BESTFIT(ptrname, widestr, _nCharacters, _pCnt, codepage) \ - CQuickBytes __qb##ptrname; \ - int __l##ptrname; \ - __l##ptrname = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, NULL, 0, NULL, NULL); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \ - DWORD _pCnt = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, ptrname, __l##ptrname, NULL, NULL); \ - if (_pCnt == 0 && _nCharacters > 0) { \ - MAKE_TRANSLATIONFAILED; \ - } \ - ptrname[__l##ptrname] = 0; - -#define MAKE_ANSIPTR_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt) \ - MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, CP_ACP) - const SIZE_T MaxSigned32BitDecString = ARRAY_SIZE("-2147483648") - 1; const SIZE_T MaxUnsigned32BitDecString = ARRAY_SIZE("4294967295") - 1; const SIZE_T 
MaxIntegerDecHexString = ARRAY_SIZE("-9223372036854775808") - 1; -const SIZE_T Max16BitHexString = ARRAY_SIZE("1234") - 1; const SIZE_T Max32BitHexString = ARRAY_SIZE("12345678") - 1; const SIZE_T Max64BitHexString = ARRAY_SIZE("1234567812345678") - 1; @@ -410,77 +302,6 @@ inline WCHAR* FormatInteger(WCHAR* str, size_t strCount, const char* fmt, I v) return str; } -inline -LPWSTR DuplicateString( - LPCWSTR wszString, - size_t cchString) -{ - STATIC_CONTRACT_NOTHROW; - - LPWSTR wszDup = NULL; - if (wszString != NULL) - { - wszDup = new (nothrow) WCHAR[cchString + 1]; - if (wszDup != NULL) - { - wcscpy_s(wszDup, cchString + 1, wszString); - } - } - return wszDup; -} - -inline -LPWSTR DuplicateString( - LPCWSTR wszString) -{ - STATIC_CONTRACT_NOTHROW; - - if (wszString != NULL) - { - return DuplicateString(wszString, u16_strlen(wszString)); - } - else - { - return NULL; - } -} - -void DECLSPEC_NORETURN ThrowOutOfMemory(); - -inline -LPWSTR DuplicateStringThrowing( - LPCWSTR wszString, - size_t cchString) -{ - STATIC_CONTRACT_THROWS; - - if (wszString == NULL) - return NULL; - - LPWSTR wszDup = DuplicateString(wszString, cchString); - if (wszDup == NULL) - ThrowOutOfMemory(); - - return wszDup; -} - -inline -LPWSTR DuplicateStringThrowing( - LPCWSTR wszString) -{ - STATIC_CONTRACT_THROWS; - - if (wszString == NULL) - return NULL; - - LPWSTR wszDup = DuplicateString(wszString); - if (wszDup == NULL) - ThrowOutOfMemory(); - - return wszDup; -} - - //***************************************************************************** // Placement new is used to new and object at an exact location. The pointer // is simply returned to the caller without actually using the heap. 
The diff --git a/src/coreclr/vm/rtlfunctions.cpp b/src/coreclr/vm/rtlfunctions.cpp index 23f662b4d600ae..f3f80338f3f8ec 100644 --- a/src/coreclr/vm/rtlfunctions.cpp +++ b/src/coreclr/vm/rtlfunctions.cpp @@ -103,7 +103,7 @@ VOID InstallEEFunctionTable ( } else { - NewArrayHolder wzTempName(DuplicateStringThrowing(ssTempName.GetUnicode())); + NewArrayHolder wzTempName(ssTempName.GetCopyOfUnicodeString()); // publish result if (InterlockedCompareExchangeT(&wszModuleName, (LPWSTR)wzTempName, nullptr) == nullptr) From 3b1e48b7d6526dbdf996f555f9e6a81da295a2ef Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Sat, 17 Jun 2023 23:30:07 +0300 Subject: [PATCH 7/9] Fix custom alloc in mono --- src/mono/mono/eglib/giconv.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/mono/mono/eglib/giconv.c b/src/mono/mono/eglib/giconv.c index 7863d8cbd35cd6..93ee1157bf4dbe 100644 --- a/src/mono/mono/eglib/giconv.c +++ b/src/mono/mono/eglib/giconv.c @@ -36,10 +36,6 @@ #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline)) #endif -#define UNROLL_DECODE_UTF8 0 - -static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar); - #if G_BYTE_ORDER == G_LITTLE_ENDIAN #define decode_utf16 decode_utf16le #else @@ -386,9 +382,15 @@ g_utf8_to_utf16le_custom_alloc_impl (const gchar *str, glong len, glong *items_r if (ret <= 0) return NULL; - gunichar2 *lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); + gunichar2 *lpDestStr = custom_alloc_func((ret + 1) * sizeof(gunichar2), custom_alloc_data); + if (G_UNLIKELY (!lpDestStr)) { + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); + return NULL; + } + flags |= MINIPAL_MB_NO_REPLACE_INVALID_CHARS; ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags); + lpDestStr[ret] = '\0'; map_error(err); return lpDestStr; @@ -510,6 +512,8 @@ 
g_utf16_to_utf8_impl (const gunichar2 *str, glong len, glong *items_read, glong len = 0; while (str[len]) len++; + + len++; } glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, flags); @@ -521,7 +525,7 @@ g_utf16_to_utf8_impl (const gunichar2 *str, glong len, glong *items_read, glong if (ret <= 0) return NULL; - lpDestStr = (gchar *)malloc((ret + 1) * sizeof(gchar)); + lpDestStr = (gchar *)g_malloc((ret + 1) * sizeof(gchar)); ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, flags); lpDestStr[ret] = '\0'; @@ -553,6 +557,8 @@ g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read len = 0; while (str[len]) len++; + + len++; } glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, 0); @@ -565,7 +571,13 @@ g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read return NULL; gchar *lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); + if (G_UNLIKELY (!lpDestStr)) { + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); + return NULL; + } + ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, 0); + lpDestStr[ret] = '\0'; map_error(err); return lpDestStr; From b7d26cdcd1c2977095189d003eff2e293546a2ec Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Tue, 20 Jun 2023 10:41:35 +0300 Subject: [PATCH 8/9] Error on invalid sequences when caller requested --- src/mono/mono/eglib/giconv.c | 7 +++--- src/native/minipal/utf8.c | 43 ++++++++++++++---------------------- src/native/minipal/utf8.h | 1 + 3 files changed, 22 insertions(+), 29 deletions(-) diff --git a/src/mono/mono/eglib/giconv.c b/src/mono/mono/eglib/giconv.c index 93ee1157bf4dbe..8ae955c303fe25 100644 --- a/src/mono/mono/eglib/giconv.c +++ b/src/mono/mono/eglib/giconv.c @@ -321,8 +321,9 @@ static FORCE_INLINE (void) map_error(GError **err) { if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) { - g_set_error (err, 
G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, - "Allocation failed."); + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); + } else if (errno == MINIPAL_ERROR_NO_UNICODE_TRANSLATION) { + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Illegal byte sequence encountered in the input."); } } @@ -351,7 +352,7 @@ g_utf8_to_utf16_impl (const gchar *str, glong len, glong *items_read, glong *ite lpDestStr = malloc((ret + 1) * sizeof(gunichar2)); ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags); - lpDestStr[ret] = '\0'; + lpDestStr[ret] = '\0'; if (items_written) *items_written = errno == 0 ? ret : 0; diff --git a/src/native/minipal/utf8.c b/src/native/minipal/utf8.c index bacad116efcd8a..b93b7308c1c2c3 100644 --- a/src/native/minipal/utf8.c +++ b/src/native/minipal/utf8.c @@ -152,13 +152,6 @@ static void DecoderReplacementFallbackBuffer_Reset(DecoderBuffer* self) self->byteStart = NULL; } -// Set the above values -static void DecoderBuffer_InternalInitialize(DecoderBuffer* self, unsigned char* byteStart, CHAR16_T* charEnd) -{ - self->byteStart = byteStart; - self->charEnd = charEnd; -} - typedef struct { const CHAR16_T strDefault[3]; @@ -442,12 +435,17 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun goto EncodeChar; InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes + if (!self->useFallback) + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + if (!fallbackUsed) { fallbackUsed = true; - if (self->useFallback) DecoderBuffer_InternalInitialize(&self->buffer.decoder, bytes, NULL); + self->buffer.decoder.byteStart = bytes; + self->buffer.decoder.charEnd = NULL; } charCount += self->buffer.decoder.strDefaultLength; @@ -728,12 +726,6 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun { // We were already adjusting for these, so need 
to unadjust charCount += (ch >> 30); - // Have to do fallback for invalid bytes - if (!fallbackUsed) - { - fallbackUsed = true; - if (self->useFallback) DecoderBuffer_InternalInitialize(&self->buffer.decoder, bytes, NULL); - } charCount += self->buffer.decoder.strDefaultLength; } @@ -848,12 +840,19 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, goto EncodeChar; InvalidByteSequence: + if (!self->useFallback) + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + // this code fragment should be close to the gotos referencing it // Have to do fallback for invalid bytes if (!fallbackUsed) { fallbackUsed = true; - if (self->useFallback) DecoderBuffer_InternalInitialize(&self->buffer.decoder, bytes, pAllocatedBufferEnd); + self->buffer.decoder.byteStart = bytes; + self->buffer.decoder.charEnd = pAllocatedBufferEnd; } // That'll back us up the appropriate # of bytes if we didn't get anywhere @@ -862,7 +861,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, // Check if we ran out of buffer space assert(pSrc >= bytes || pTarget == chars); - if (self->useFallback) DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder); + DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder); if (pTarget == chars) { errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; @@ -1247,13 +1246,6 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, if (ch != 0) { - // Have to do fallback for invalid bytes - if (!fallbackUsed) - { - fallbackUsed = true; - if (self->useFallback) DecoderBuffer_InternalInitialize(&self->buffer.decoder, bytes, NULL); - } - // This'll back us up the appropriate # of bytes if we didn't get anywhere if (!self->useFallback) { @@ -1261,7 +1253,6 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, // Ran out of buffer space // Need to throw an exception? 
- if (self->useFallback) DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder); if (pTarget == chars) { errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; diff --git a/src/native/minipal/utf8.h b/src/native/minipal/utf8.h index 29c5ba8ab308c2..bd648f137a2bb1 100644 --- a/src/native/minipal/utf8.h +++ b/src/native/minipal/utf8.h @@ -11,6 +11,7 @@ #define MINIPAL_MB_NO_REPLACE_INVALID_CHARS 0x00000008 #define MINIPAL_TREAT_AS_LITTLE_ENDIAN 0x00000016 #define MINIPAL_ERROR_INSUFFICIENT_BUFFER 122L +#define MINIPAL_ERROR_NO_UNICODE_TRANSLATION 1113L #ifdef __cplusplus extern "C" From 18c04bfb2dbe680b90eb549d46212d70e1c34127 Mon Sep 17 00:00:00 2001 From: Adeel <3840695+am11@users.noreply.github.com> Date: Tue, 20 Jun 2023 22:52:17 +0300 Subject: [PATCH 9/9] Remove count from convert APIs --- src/coreclr/pal/src/locale/unicode.cpp | 4 - src/native/minipal/utf8.c | 138 ++++++++++++++----------- 2 files changed, 78 insertions(+), 64 deletions(-) diff --git a/src/coreclr/pal/src/locale/unicode.cpp b/src/coreclr/pal/src/locale/unicode.cpp index b9a0291394dc9b..8bfa58608e5942 100644 --- a/src/coreclr/pal/src/locale/unicode.cpp +++ b/src/coreclr/pal/src/locale/unicode.cpp @@ -253,8 +253,6 @@ MultiByteToWideChar( goto EXIT; } - // Use minipal_convert_utf8_to_utf16 on all systems, since it replaces - // invalid characters and Core Foundation doesn't do that. if (CodePage == CP_UTF8 || CodePage == CP_ACP) { if (cbMultiByte < 0) @@ -344,8 +342,6 @@ WideCharToMultiByte( defaultChar = *lpDefaultChar; } - // Use minipal_convert_utf16_to_utf8 on all systems because we use - // UTF8ToUnicode in MultiByteToWideChar() on all systems. 
if (CodePage == CP_UTF8 || CodePage == CP_ACP) { if (cchWideChar < 0) diff --git a/src/native/minipal/utf8.c b/src/native/minipal/utf8.c index b93b7308c1c2c3..a54b805540f897 100644 --- a/src/native/minipal/utf8.c +++ b/src/native/minipal/utf8.c @@ -92,7 +92,7 @@ static bool DecoderReplacementFallbackBuffer_Fallback(DecoderBuffer* self) // Right now this has both bytes and bytes[], since we might have extra bytes, hence the // array, and we might need the index, hence the byte* // Don't touch ref chars unless we succeed -static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer* self, CHAR16_T** chars) +static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer* self, CHAR16_T** chars, CHAR16_T* pAllocatedBufferEnd) { assert(self->byteStart != NULL); @@ -132,6 +132,11 @@ static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer } *(charTemp++) = ch; + if (charTemp > pAllocatedBufferEnd) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return false; + } } // Need to make sure that bHighSurrogate isn't true @@ -332,13 +337,13 @@ static bool InRange(int c, int begin, int end) // During GetChars we had an invalid byte sequence // pSrc is backed up to the start of the bad sequence if we didn't have room to // fall it back. Otherwise pSrc remains where it is. 
-static bool FallbackInvalidByteSequence_Copy(UTF8Encoding* self, unsigned char** pSrc, CHAR16_T** pTarget) +static bool FallbackInvalidByteSequence_Copy(UTF8Encoding* self, unsigned char** pSrc, CHAR16_T** pTarget, CHAR16_T* pAllocatedBufferEnd) { assert(self->useFallback); // Get our byte[] unsigned char* pStart = *pSrc; - bool fallbackResult = DecoderReplacementFallbackBuffer_InternalFallback_Copy(&self->buffer.decoder, pTarget); + bool fallbackResult = DecoderReplacementFallbackBuffer_InternalFallback_Copy(&self->buffer.decoder, pTarget, pAllocatedBufferEnd); // Do the actual fallback if (!fallbackResult) @@ -736,6 +741,14 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun return charCount; } +#define ENSURE_BUFFER_INC \ + pTarget++; \ + if (pTarget > pAllocatedBufferEnd) \ + { \ + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; \ + return 0; \ + } + static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount) { assert(chars != NULL); @@ -830,7 +843,8 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, { *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) + (HIGH_SURROGATE_START - (0x10000 >> 10))); - pTarget++; + + ENSURE_BUFFER_INC ch = (ch & 0x3FF) + (int)(LOW_SURROGATE_START); @@ -856,17 +870,14 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, } // That'll back us up the appropriate # of bytes if we didn't get anywhere - if (!FallbackInvalidByteSequence_Copy(self, &pSrc, &pTarget)) + if (!FallbackInvalidByteSequence_Copy(self, &pSrc, &pTarget, pAllocatedBufferEnd)) { + if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) return 0; + // Check if we ran out of buffer space - assert(pSrc >= bytes || pTarget == chars); + assert(pSrc >= bytes); DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder); - if (pTarget == chars) - { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - return 0; - } ch = 0; break; } @@ -960,15 +971,7 @@ 
static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, } pSrc--; - // Throw that we don't have enough room (pSrc could be < chars if we had started to process - // a 4 byte sequence already) - assert(pSrc >= bytes || pTarget == chars); - - if (pTarget == chars) - { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - return 0; - } + assert(pSrc >= bytes); // Don't store ch in decoder, we already backed up to its start ch = 0; @@ -977,7 +980,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, break; } *pTarget = (CHAR16_T)ch; - pTarget++; + ENSURE_BUFFER_INC int availableChars = pAllocatedBufferEnd - pTarget; int availableBytes = pEnd - pSrc; @@ -1004,7 +1007,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, if (ch > 0x7F) goto ProcessChar; *pTarget = (CHAR16_T)ch; - pTarget++; + ENSURE_BUFFER_INC } // we are done ch = 0; @@ -1028,7 +1031,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, if (ch > 0x7F) goto LongCode; *pTarget = (CHAR16_T)ch; - pTarget++; + ENSURE_BUFFER_INC // get pSrc to be 2-byte aligned if ((((size_t)pSrc) & 0x1) != 0) @@ -1038,7 +1041,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, if (ch > 0x7F) goto LongCode; *pTarget = (CHAR16_T)ch; - pTarget++; + ENSURE_BUFFER_INC } // get pSrc to be 4-byte aligned @@ -1047,6 +1050,13 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, ch = *(unsigned short*)pSrc; if ((ch & 0x8080) != 0) goto LongCodeWithMask16; + + if (pTarget + 2 > pAllocatedBufferEnd) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } + // Unfortunately, this is endianness sensitive #if BIGENDIAN if (!self->treatAsLE) @@ -1073,6 +1083,12 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, int chb = *(int*)(pSrc + 4); if (((ch | chb) & (int)0x80808080) != 0) goto LongCodeWithMask32; + if (pTarget + 8 > 
pAllocatedBufferEnd) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } + // Unfortunately, this is endianness sensitive #if BIGENDIAN if (!self->treatAsLE) @@ -1124,7 +1140,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, if (ch <= 0x7F) { *pTarget = (CHAR16_T)ch; - pTarget++; + ENSURE_BUFFER_INC continue; } @@ -1176,7 +1192,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) + (HIGH_SURROGATE_START - (0x10000 >> 10))); - pTarget++; + ENSURE_BUFFER_INC ch = (ch & 0x3FF) + (LOW_SURROGATE_START); @@ -1224,7 +1240,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, } *pTarget = (CHAR16_T)ch; - pTarget++; + ENSURE_BUFFER_INC // extra byte, we're only expecting 1 char for each of these 2 bytes, // but the loop is testing the target (not source) against pStop. @@ -1267,6 +1283,12 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, // (don't have to check m_throwOnOverflow for chars) assert(!fallbackUsed || self->buffer.decoder.fallbackCount < 0); + if (pSrc < pEnd) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } + return pTarget - chars; } @@ -1467,22 +1489,22 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un else { *pTarget = (unsigned char)(0xF0 | (ch >> 18)); - pTarget++; + ENSURE_BUFFER_INC chb = 0x80 | ((ch >> 12) & 0x3F); } *pTarget = (unsigned char)chb; - pTarget++; + ENSURE_BUFFER_INC chb = 0x80 | ((ch >> 6) & 0x3F); } *pTarget = (unsigned char)chb; - pTarget++; + ENSURE_BUFFER_INC *pTarget = (unsigned char)0x80 | (ch & 0x3F); } - pTarget++; + ENSURE_BUFFER_INC // If still have fallback don't do fast loop if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0) @@ -1514,7 +1536,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un if (ch > 
0x7F) goto ProcessChar; *pTarget = (unsigned char)ch; - pTarget++; + ENSURE_BUFFER_INC } // we are done, let ch be 0 to clear encoder ch = 0; @@ -1546,7 +1568,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un if (ch > 0x7F) goto LongCode; *pTarget = (unsigned char)ch; - pTarget++; + ENSURE_BUFFER_INC // get pSrc aligned if (((size_t)pSrc & 0x2) != 0) @@ -1556,7 +1578,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un if (ch > 0x7F) goto LongCode; *pTarget = (unsigned char)ch; - pTarget++; + ENSURE_BUFFER_INC } // Run 4 characters at a time! @@ -1567,6 +1589,12 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un if (((ch | chc) & (int)0xFF80FF80) != 0) goto LongCodeWithMask; + if (pTarget + 4 > pAllocatedBufferEnd) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } + // Unfortunately, this is endianness sensitive #if BIGENDIAN if (!self->treatAsLE) @@ -1603,7 +1631,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un if (ch > 0x7F) goto LongCode; *pTarget = (unsigned char)ch; - pTarget++; + ENSURE_BUFFER_INC continue; LongCode: @@ -1650,23 +1678,23 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un // pStop - this unsigned char is compensated by the second surrogate character // 2 input chars require 4 output bytes. 2 have been anticipated already // and 2 more will be accounted for by the 2 pStop-- calls below. - pTarget++; + ENSURE_BUFFER_INC chd = 0x80 | ((ch >> 12) & 0x3F); } *pTarget = (unsigned char)chd; pStop--; // 3 unsigned char sequence for 1 char, so need pStop-- and the one below too. - pTarget++; + ENSURE_BUFFER_INC chd = 0x80 | ((ch >> 6) & 0x3F); } *pTarget = (unsigned char)chd; pStop--; // 2 unsigned char sequence for 1 char so need pStop--. 
- pTarget++; + ENSURE_BUFFER_INC *pTarget = (unsigned char)(0x80 | (ch & 0x3F)); // pStop - this unsigned char is already included - pTarget++; + ENSURE_BUFFER_INC } assert(pTarget <= pAllocatedBufferEnd); @@ -1675,6 +1703,12 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un ch = 0; } + if (pSrc < pEnd) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } + return (int)(pTarget - bytes); } @@ -2080,16 +2114,8 @@ size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CH #endif }; - if (GetCharCount(&enc, (unsigned char*)source, sourceLength) > destinationLength) - { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - ret = 0; - } - else - { - ret = GetChars(&enc, (unsigned char*)source, sourceLength, destination, destinationLength); - if (errno) ret = 0; - } + ret = GetChars(&enc, (unsigned char*)source, sourceLength, destination, destinationLength); + if (errno) ret = 0; return ret; } @@ -2112,20 +2138,12 @@ size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength #endif }; - if (GetByteCount(&enc, (CHAR16_T*)source, sourceLength) > destinationLength) - { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - ret = 0; - } - else - { #if !BIGENDIAN - (void)flags; // unused + (void)flags; // unused #endif - ret = GetBytes(&enc, (CHAR16_T*)source, sourceLength, (unsigned char*)destination, destinationLength); - if (errno) ret = 0; - } + ret = GetBytes(&enc, (CHAR16_T*)source, sourceLength, (unsigned char*)destination, destinationLength); + if (errno) ret = 0; return ret; }