Skip to content

Commit

Permalink
Merge 0cf9e2e into 8f58ec8
Browse files Browse the repository at this point in the history
  • Loading branch information
Kohki Akikaze authored Oct 3, 2020
2 parents 8f58ec8 + 0cf9e2e commit a6cd22f
Showing 1 changed file with 97 additions and 43 deletions.
140 changes: 97 additions & 43 deletions sakura_core/charset/CESI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,12 +501,20 @@ void CESI::GetEncodingInfo_meta( const char* pS, const int nLen )
{
// XML宣言は先頭にあるので、最初にチェック
ECodeType encoding = AutoDetectByXML( pS, nLen );
if( encoding == CODE_NONE ){
auto nret = CODE_NONE;
if( encoding == CODE_NONE || encoding == CODE_AUTODETECT ){
// スクリプト等Coding中にHTMLがあるのでCodingを優先
encoding = AutoDetectByCoding( pS, nLen );
}
if( encoding == CODE_NONE ){
encoding = AutoDetectByHTML( pS, nLen );
nret = AutoDetectByCoding( pS, nLen );
if( nret != CODE_NONE ){
// 判定に成功した場合はencodingを更新する
encoding = nret;
} else {
nret = AutoDetectByHTML( pS, nLen );
if( nret != CODE_NONE ){
// 判定に成功した場合はencodingを更新する
encoding = nret;
}
}
}
m_eMetaName = encoding;
}
Expand Down Expand Up @@ -745,24 +753,61 @@ void CESI::GuessUtf8OrCesu8( void )
}
}

// 2016.04.05 いくつかの互換名を追加
static const struct{
const char* name;
int nLen;
int nCode;
} encodingNameToCode[] = {
{ "shift_jis", 9, CODE_SJIS },
{ "windows-31j", 11, CODE_SJIS },
{ "x-sjis", 6, CODE_SJIS },
{ "shift_jis", 9, CODE_SJIS },
{ "cp932", 9, CODE_SJIS },
{ "cp932", 5, CODE_SJIS },
{ "ms932", 5, CODE_SJIS },
{ "shift-jis", 9, CODE_SJIS },
{ "csWindows31J", 12, CODE_SJIS },
{ "MS_Kanji", 8, CODE_SJIS },
{ "csShiftJIS", 10, CODE_SJIS },
{ "sjis", 4, CODE_SJIS },
{ "iso-2022-jp", 11, CODE_JIS },
{ "iso2022jp", 9, CODE_JIS },
{ "csISO2022jp", 11, CODE_JIS },
{ "euc-jp", 6, CODE_EUC },
{ "euc_jp", 6, CODE_EUC },
{ "eucjp", 5, CODE_EUC },
{ "cseucpkdfmtjapanese", 19, CODE_EUC },
{ "extended_unix_code_packed_format_for_japanese", 45, CODE_EUC },
{ "x-euc-jp", 8, CODE_EUC },
// { "utf-7", 5, CODE_UTF7 },
// { "csutf7", 6, CODE_UTF7 },
{ "utf-8", 5, CODE_UTF8 },
{ "utf_8", 5, CODE_UTF8 },
{ "utf8", 4, CODE_UTF8 },
{ "csutf8", 6, CODE_UTF8 },
{ "unicode-1-1-utf-8", 17, CODE_UTF8 },
{ "unicode11utf8", 13, CODE_UTF8 },
{ "unicode20utf8", 13, CODE_UTF8 },
{ "x-unicode20utf8", 15, CODE_UTF8 },
{ "cesu-8", 6, CODE_CESU8 },
{ "cscesu-8", 6, CODE_CESU8 },
{ "cscesu8", 7, CODE_CESU8 },
{ "iso-8859-1", 10, CODE_LATIN1 },
{ "latin1", 7, CODE_LATIN1 },
{ "latin-1", 8, CODE_LATIN1 },
{ "latin1", 6, CODE_LATIN1 },
{ "latin-1", 7, CODE_LATIN1 },
{ "iso8859_1", 9, CODE_LATIN1 },
{ "iso_8859_1", 10, CODE_LATIN1 },
{ "iso88591", 8, CODE_LATIN1 },
{ "cp819", 5, CODE_LATIN1 },
{ "csisolatin1", 11, CODE_LATIN1 },
{ "ibm819", 6, CODE_LATIN1 },
{ "iso-ir-100", 10, CODE_LATIN1 },
{ "iso8859-1", 9, CODE_LATIN1 },
{ "iso_8859-1", 10, CODE_LATIN1 },
{ "l1", 2, CODE_LATIN1 },
{ "windows-1252", 12, CODE_LATIN1 },
{ "cp1252", 6, CODE_LATIN1 },
{ "cswindows1252", 13, CODE_LATIN1 },
{ "x-cp1252", 8, CODE_LATIN1 },
{"ibm437", 6, 437},
{"asmo-708", 8, 708},
{"dos-720", 7, 720},
Expand All @@ -788,7 +833,6 @@ static const struct{
{"ibm1026", 7, 1026},
{"windows-1250", 12, 1250},
{"windows-1251", 12, 1251},
{"windows-1252", 12, 1252},
{"windows-1253", 12, 1253},
{"windows-1254", 12, 1254},
{"windows-1255", 12, 1255},
Expand Down Expand Up @@ -861,7 +905,6 @@ static const struct{
{"x-iscii-ma", 10, 57009},
{"x-iscii-gu", 10, 57010},
{"x-iscii-pa", 10, 57011},
{ NULL, 0, 0}
};

static bool IsXMLWhiteSpace( int c )
Expand All @@ -876,8 +919,28 @@ static bool IsXMLWhiteSpace( int c )
return false;
}

/*!
	Look up the character code that corresponds to an encoding name.

	The name is compared against every entry of encodingNameToCode using
	_memicmp, and it must have exactly the same length as the entry's name.

	@param [in] pBuf  start of the encoding name (length is given by nSize)
	@param [in] nSize length of the encoding name
	@return the character code of the first matching entry,
	        or CODE_NONE when no entry matches
*/
static ECodeType MatchEncoding(const char* pBuf, int nSize)
{
	for( const auto& entry : encodingNameToCode ){
		// Lengths must agree before the bytes are compared, so a table name
		// that is merely a prefix of the input is never accepted.
		if( entry.nLen != nSize ){
			continue;
		}
		if( _memicmp( entry.name, pBuf, entry.nLen ) == 0 ){
			return static_cast<ECodeType>(entry.nCode);
		}
	}
	return CODE_NONE;
}

/*! ファイル中のエンコーディング指定を利用した文字コード自動選択
* @return 決定した文字コード。 未決定は-1を返す
* @return 決定した文字コード。 未決定は-1を返す。
xml宣言ありでencodingがない場合、CODE_AUTODETECTなので注意
*/
ECodeType CESI::AutoDetectByXML( const char* pBuf, int nSize )
{
Expand Down Expand Up @@ -913,21 +976,16 @@ ECodeType CESI::AutoDetectByXML( const char* pBuf, int nSize )
quoteChar = pBuf[i];
i++;
int k;
for( k = 0; encodingNameToCode[k].name != NULL; k++ ){
const int nLen = encodingNameToCode[k].nLen;
if( i + nLen < nSize - 1
&& pBuf[i + nLen] == quoteChar
&& 0 == _memicmp( encodingNameToCode[k].name, pBuf + i, nLen ) ){
return static_cast<ECodeType>(encodingNameToCode[k].nCode);
}
}
for(k = i; pBuf[k] != quoteChar && k < nSize - 1; ++k){}
// 2016.04.05 不明なencoding名の場合にUTF-8になっていたのをNoneに変更
return MatchEncoding(pBuf + i, k - i);
}else{
if( pBuf[i] == '<' || pBuf[i] == '>' ){
break;
}
// encoding指定無しでxml宣言が終了した
if( pBuf[i] == '?' && pBuf[i + 1] == '>' ){
return CODE_UTF8;
return CODE_AUTODETECT;
}
}
}
Expand All @@ -937,7 +995,7 @@ ECodeType CESI::AutoDetectByXML( const char* pBuf, int nSize )
}
// encoding指定無しでxml宣言が終了した
if( pBuf[i] == '?' && pBuf[i + 1] == '>' ){
return CODE_UTF8;
return CODE_AUTODETECT;
}
}
}else
Expand Down Expand Up @@ -1041,29 +1099,20 @@ ECodeType CESI::AutoDetectByHTML( const char* pBuf, int nSize )
if( nEndAttVal <= i ){ i = nNextPos; continue; }
int nCharsetBegin = i;
while( i < nEndAttVal && !IsXMLWhiteSpace(pBuf[i]) ){ i++; }
int k;
for( k = 0; encodingNameToCode[k].name != NULL; k++ ){
const int nLen = encodingNameToCode[k].nLen;
if( i - nCharsetBegin == nLen
&& 0 == _memicmp( encodingNameToCode[k].name, pBuf + nCharsetBegin, nLen ) ){
if( bContentType ){
return static_cast<ECodeType>(encodingNameToCode[k].nCode);
}else{
encoding = static_cast<ECodeType>(encodingNameToCode[k].nCode);
break;
}
ECodeType eCode = MatchEncoding(pBuf + nCharsetBegin, i - nCharsetBegin);
if( eCode != CODE_NONE ){
if( bContentType ){
return eCode;
}else{
encoding = eCode;
}
}
}
i = nNextPos;
}else if( 3 == nAttType ){
int k;
for( k = 0; encodingNameToCode[k].name != NULL; k++ ){
const int nLen = encodingNameToCode[k].nLen;
if( nEndAttVal - nBeginAttVal == nLen
&& 0 == _memicmp( encodingNameToCode[k].name, pBuf + nBeginAttVal, nLen ) ){
return static_cast<ECodeType>(encodingNameToCode[k].nCode);
}
ECodeType eCode = MatchEncoding(pBuf + nBeginAttVal, nEndAttVal - nBeginAttVal);
if( eCode != CODE_NONE ){
return eCode;
}
}
}else if( '<' == pBuf[i] ){
Expand All @@ -1085,6 +1134,7 @@ static bool IsEncodingNameChar( int c )
{
return ('A' <= c && c <= 'Z')
|| ('a' <= c && c <= 'z')
|| ('0' <= c && c <= '9')
|| '_' == c
|| '-' == c
;
Expand All @@ -1111,7 +1161,7 @@ ECodeType CESI::AutoDetectByCoding( const char* pBuf, int nSize )
return CODE_NONE;
}
int k;
for( k = 0; encodingNameToCode[k].name != NULL; k++ ){
for( k = 0; k < _countof(encodingNameToCode); k++ ){
const int nLen = encodingNameToCode[k].nLen;
if( i - nBegin == nLen
&& 0 == _memicmp( encodingNameToCode[k].name, pBuf + nBegin, nLen ) ){
Expand Down Expand Up @@ -1196,7 +1246,7 @@ static ECodeType DetectUnicode( CESI* pcesi )
return pcesi->m_aWcInfo[ebom_type].eCodeID;
}

/*
/*!
日本語コードセット判定
*/
ECodeType CESI::CheckKanjiCode(const char* pBuf, size_t nBufLen) noexcept
Expand All @@ -1215,7 +1265,7 @@ ECodeType CESI::CheckKanjiCode(const char* pBuf, size_t nBufLen) noexcept
*/
SetInformation(pBuf, nBufLen);

if( GetMetaName() != CODE_NONE ){
if( GetMetaName() != CODE_NONE && GetMetaName() != CODE_AUTODETECT ){
return GetMetaName();
}
auto nret = DetectUnicode( this );
Expand All @@ -1226,6 +1276,10 @@ ECodeType CESI::CheckKanjiCode(const char* pBuf, size_t nBufLen) noexcept
if( nret != CODE_NONE && GetStatus() != ESI_NODETECTED ){
return nret;
}
if( GetMetaName() == CODE_AUTODETECT ){
// MetaがAUTODETECTの場合は、encodingがないxml文書。これまで通りUTF-8とみなす。
return CODE_UTF8;
}

// デフォルト文字コードを返す
return m_pEncodingConfig->m_eDefaultCodetype;
Expand Down

0 comments on commit a6cd22f

Please sign in to comment.