From c7b58e3fe777a1107e895ca2b0a5f3f75ab16db8 Mon Sep 17 00:00:00 2001 From: nathancorvussolis Date: Sat, 2 Aug 2014 19:19:51 +0900 Subject: [PATCH] 2.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ・約16KBを超過する行がファイルにあるとき文字コード変換に失敗することがあるバグを修正しました。 ・文字コード指定のオプションを変更しました。  ・cveuc.exe [-euwEUW] <入力ファイル> <出力ファイル>   ・入力ファイル文字コード指定    ・-e : EUC-JIS-2004 (改行LF) (デフォルト)    ・-u : UTF-8 (BOMなしまたはBOMあり、改行CR+LFまたはLF)    ・-w : UTF-16 (LE、BOMなしまたはBOMあり、改行CR+LFまたはLF)   ・出力ファイル文字コード指定    ・-E : EUC-JIS-2004 (改行LF)    ・-U : UTF-8 (BOMなし、改行LF)    ・-W : UTF-16 (LE、BOMあり、改行CR+LF) (デフォルト) --- README.TXT | 65 ++++++++---- cveuc/cveuc.cpp | 205 ++++++++++++++++++++---------------- cveuc/cveuc.vcxproj | 2 + cveuc/cveuc.vcxproj.filters | 6 ++ cveuc/eucjis2004.cpp | 62 ++++++++++- cveuc/eucjis2004.h | 6 ++ cveuc/stdafx.h | 2 + cveuc/utf8.cpp | 48 +++++++++ cveuc/utf8.h | 11 ++ 9 files changed, 293 insertions(+), 114 deletions(-) create mode 100644 cveuc/utf8.cpp create mode 100644 cveuc/utf8.h diff --git a/README.TXT b/README.TXT index da29772..8116d95 100644 --- a/README.TXT +++ b/README.TXT @@ -1,55 +1,78 @@  ------------------------------------------------------------------------------- - cveuc 1.2.0 + cveuc 2.0.0 ------------------------------------------------------------------------------- 文字コード変換コマンドラインプログラムです。 -EUC-JIS-2004(改行LF)、UTF-16(LE、BOMあり、改行CR+LF)、UTF-8(BOMなし、改行LF) を -相互に変換します。 +EUC-JIS-2004、UTF-16、UTF-8 を相互に変換します。 変換テーブルは cygwin の iconv 1.14 と同等としています。 実行方法 - cveuc.exe [-uUW] + cveuc.exe [-euwEUW] オプション - -u : 入力ファイル UTF-8 - -U : 出力ファイル UTF-8 - -W : 出力ファイル UTF-16 + 入力ファイル文字コード指定 + + -e : EUC-JIS-2004 (改行LF) (デフォルト) + -u : UTF-8 (BOMなしまたはBOMあり、改行CR+LFまたはLF) + -w : UTF-16 (LE、BOMなしまたはBOMあり、改行CR+LFまたはLF) + + 出力ファイル文字コード指定 + + -E : EUC-JIS-2004 (改行LF) + -U : UTF-8 (BOMなし、改行LF) + -W : UTF-16 (LE、BOMあり、改行CR+LF) (デフォルト) 使用例 - 1) EUC-JIS-2004 → UTF-16 - cveuc.exe euc.txt utf16.txt + 1) EUC-JIS-2004 → EUC-JIS-2004 - 2) EUC-JIS-2004 → UTF-16 - cveuc.exe -W euc.txt utf16.txt + cveuc.exe -E euc.txt euc_2.txt + cveuc.exe -e -E euc.txt euc_2.txt + + 2) EUC-JIS-2004 → UTF-8 - 3) EUC-JIS-2004 → UTF-8 cveuc.exe -U euc.txt utf8.txt + cveuc.exe -e -U euc.txt utf8.txt + + 3) EUC-JIS-2004 → UTF-16 + + cveuc.exe euc.txt utf16.txt + cveuc.exe -e euc.txt utf16.txt + cveuc.exe -W euc.txt utf16.txt + cveuc.exe -e -W euc.txt utf16.txt 4) UTF-16 → EUC-JIS-2004 - cveuc.exe utf16.txt euc.txt + + cveuc.exe -w -E utf16.txt euc.txt 5) UTF-16 → UTF-8 - cveuc.exe -U utf16.txt utf8.txt - 6) UTF-8 → EUC-JIS-2004 - cveuc.exe -u utf8.txt euc.txt + cveuc.exe -w -U utf16.txt utf8.txt - 7) UTF-8 → UTF-16 - cveuc.exe -u -W utf8.txt utf16.txt + 6) UTF-16 → UTF-16 + + cveuc.exe -w utf16.txt utf16_2.txt + cveuc.exe -w -W utf16.txt utf16_2.txt + + 7) UTF-8 → EUC-JIS-2004 - 8) UTF-16 → UTF-16 (単なるコピーと同じ) - cveuc.exe -W utf16.txt utf16_2.txt + cveuc.exe -u -E utf8.txt euc.txt + + 8) UTF-8 → UTF-8 - 9) UTF-8 → UTF-8 (単なるコピーと同じ) cveuc.exe -u -U utf8.txt utf8_2.txt + 9) UTF-8 → UTF-16 + + cveuc.exe -u utf8.txt utf16.txt + cveuc.exe -u -W utf8.txt utf16.txt + ------------------------------------------------------------------------------- diff --git a/cveuc/cveuc.cpp b/cveuc/cveuc.cpp index 54f7796..66b1480 100644 --- a/cveuc/cveuc.cpp +++ b/cveuc/cveuc.cpp @@ -1,16 +1,23 @@  #include "eucjis2004.h" +#include "utf8.h" -#define VERSION L"1.2.0" +#define VERSION L"2.0.0" -#define BUFSIZE 0x8000 +#define BUFSIZE 0x800 -#define RccsUTF8 L"r,ccs=UTF-8" -#define WccsUTF8 L"w,ccs=UTF-8" -#define WccsUTF16LE L"w,ccs=UTF-16LE" -#define RB L"rb" -#define WB L"wb" -#define BOM 0xFEFF +#define RccsUTF8 L"r,ccs=UTF-8" +#define WccsUTF8 L"w,ccs=UTF-8" +#define RccsUTF16LE L"r,ccs=UTF-16LE" +#define WccsUTF16LE L"w,ccs=UTF-16LE" +#define RB L"rb" +#define WB L"wb" + +enum enum_inenc { + in_euc, + in_utf16, + in_utf8 +}; enum enum_outenc { out_euc, @@ -21,27 +28,33 @@ enum enum_outenc { void print_usage(void) { fwprintf(stderr, L"\ncveuc %s\n\n", VERSION); - fwprintf(stderr, L"usage : cveuc [option] \n"); - fwprintf(stderr, L" option :\n"); - fwprintf(stderr, L" -u input file in UTF-8\n"); - fwprintf(stderr, L" -U output file in UTF-8 (LF, without BOM)\n"); - fwprintf(stderr, L" -W output file in UTF-16 (CR+LF, LE with BOM)\n"); - fwprintf(stderr, L"\ndefault file encodings : EUC-JIS-2004 and UTF-16(CR+LF, LE with BOM)\n"); + fwprintf(stderr, L"usage : cveuc [option] \n" + L" option :\n" + L" input file encoding :\n" + L" -e EUC-JIS-2004 (LF) (default)\n" + L" -u UTF-8 (with or without BOM, LF or CR+LF)\n" + L" -w UTF-16 (LE, with or without BOM, LF or CR+LF)\n" + L" output file encoding :\n" + L" -E EUC-JIS-2004 (LF)\n" + L" -U UTF-8 (without BOM, LF)\n" + L" -W UTF-16 (LE, with BOM, CR+LF) (default)\n" + ); } int wmain(int argc, wchar_t* argv[]) { FILE *fpi, *fpo; - WCHAR bom = L'\0'; - CHAR buf[BUFSIZE*2]; + CHAR buf[BUFSIZE * sizeof(WCHAR)]; + LPSTR pb; WCHAR wbuf[BUFSIZE]; + LPWSTR pwb; + std::string sbuf; + std::wstring wsbuf; size_t ds; BOOL ret; UINT line; - int ai; - LPCWSTR rflag = RB; - LPCWSTR wflag = WB; - int outenc = out_euc; + LPCWSTR rflag = RB, wflag = WccsUTF16LE; + int ai, inenc = in_euc, outenc = out_utf16; LPCWSTR infile, outfile; _wsetlocale(LC_ALL, L"JPN"); @@ -52,21 +65,33 @@ int wmain(int argc, wchar_t* argv[]) return -1; } - for(ai=1; ai<3; ai++) + for(ai = 1; ai < 3; ai++) { - if(wcscmp(argv[ai], L"-u") == 0) + if(wcscmp(argv[ai], L"-e") == 0) + { + } + else if(wcscmp(argv[ai], L"-u") == 0) { - bom = BOM; + inenc = in_utf8; rflag = RccsUTF8; } + else if(wcscmp(argv[ai], L"-w") == 0) + { + inenc = in_utf16; + rflag = RccsUTF16LE; + } + else if(wcscmp(argv[ai], L"-E") == 0) + { + outenc = out_euc; + wflag = WB; + } else if(wcscmp(argv[ai], L"-U") == 0) { outenc = out_utf8; + wflag = WB; } else if(wcscmp(argv[ai], L"-W") == 0) { - outenc = out_utf16; - wflag = WccsUTF16LE; } else { @@ -86,31 +111,7 @@ int wmain(int argc, wchar_t* argv[]) } infile = argv[ai]; - outfile = argv[ai+1]; - - if(bom == L'\0') - { - _wfopen_s(&fpi, infile, RB); - if(fpi == NULL) - { - fwprintf(stderr, L"error : cannot open %s.\n", infile); - return -1; - } - fread(&bom, 2, 1, fpi); - fclose(fpi); - if(bom == BOM) - { - rflag = RccsUTF8; - } - else - { - if(outenc == out_euc) - { - outenc = out_utf16; - wflag = WccsUTF16LE; - } - } - } + outfile = argv[ai + 1]; _wfopen_s(&fpi, infile, rflag); if(fpi == NULL) @@ -126,35 +127,47 @@ int wmain(int argc, wchar_t* argv[]) return -1; } - switch(bom) + if(inenc == in_utf8 || inenc == in_utf16) { - case BOM: - line = 1; - while(fgetws(wbuf, _countof(wbuf), fpi) != NULL) + for(line = 1; ; line++) { + sbuf.clear(); + wsbuf.clear(); + + while((pwb = fgetws(wbuf, _countof(wbuf), fpi)) != NULL) + { + wsbuf.append(wbuf); + + if(!wsbuf.empty() && wsbuf.back() == L'\n') + { + break; + } + } + + if(pwb == NULL) + { + break; + } + + ret = TRUE; + switch(outenc) { case out_euc: - ds = _countof(buf); - ret = WideCharToEucJis2004(wbuf, NULL, buf, &ds); + ds = -1; + ret = WideCharToEucJis2004(wsbuf.c_str(), NULL, NULL, &ds); if(ds > 0) { - fwrite(buf, ds - 1, 1, fpo); + sbuf = wstring_to_eucjis2004_string(wsbuf); + fwrite(sbuf.c_str(), sbuf.size(), 1, fpo); } break; case out_utf16: - fwprintf(fpo, L"%s", wbuf); + fwprintf(fpo, L"%s", wsbuf.c_str()); break; case out_utf8: - ds = WideCharToMultiByte(CP_UTF8, 0, wbuf, -1, buf, sizeof(buf), NULL, NULL); - if(ds > 0) - { - fwrite(buf, ds - 1, 1, fpo); - } - else - { - ret = FALSE; - } + sbuf = wstring_to_utf8_string(wsbuf); + fwrite(sbuf.c_str(), sbuf.size(), 1, fpo); break; default: break; @@ -165,45 +178,57 @@ int wmain(int argc, wchar_t* argv[]) fwprintf(stderr, L"error : cannot convert line %u\n", line); break; } - line++; } fclose(fpi); fclose(fpo); - break; - - default: - line = 1; - while(fgets(buf, _countof(buf), fpi) != NULL) + } + else + { + for(line = 1; ; line++) { + sbuf.clear(); + wsbuf.clear(); + + while((pb = fgets(buf, _countof(buf), fpi)) != NULL) + { + sbuf.append(buf); + + if(!sbuf.empty() && sbuf.back() == '\n') + { + break; + } + } + + if(pb == NULL) + { + break; + } + + ret = TRUE; + switch(outenc) { case out_euc: - ds = strlen(buf); - fwrite(buf, ds, 1, fpo); + fwrite(sbuf.c_str(), sbuf.size(), 1, fpo); break; case out_utf16: - ds = _countof(wbuf); - ret = EucJis2004ToWideChar(buf, NULL, wbuf, &ds); + ds = -1; + ret = EucJis2004ToWideChar(sbuf.c_str(), NULL, NULL, &ds); if(ds > 0) { - fwprintf(fpo, L"%s", wbuf); + wsbuf = eucjis2004_string_to_wstring(sbuf); + fwprintf(fpo, L"%s", wsbuf.c_str()); } break; case out_utf8: - ds = _countof(wbuf); - ret = EucJis2004ToWideChar(buf, NULL, wbuf, &ds); + ds = -1; + ret = EucJis2004ToWideChar(sbuf.c_str(), NULL, NULL, &ds); if(ds > 0) { - ds = WideCharToMultiByte(CP_UTF8, 0, wbuf, -1, buf, sizeof(buf), NULL, NULL); - if(ds > 0) - { - fwrite(buf, ds - 1, 1, fpo); - } - else - { - ret = FALSE; - } + wsbuf = eucjis2004_string_to_wstring(sbuf); + sbuf = wstring_to_utf8_string(wsbuf); + fwrite(sbuf.c_str(), sbuf.size(), 1, fpo); } break; default: @@ -215,12 +240,10 @@ int wmain(int argc, wchar_t* argv[]) fwprintf(stderr, L"error : cannot convert line %u\n", line); break; } - line++; } fclose(fpi); fclose(fpo); - break; } return 0; diff --git a/cveuc/cveuc.vcxproj b/cveuc/cveuc.vcxproj index ccb8ed1..0ba9e51 100644 --- a/cveuc/cveuc.vcxproj +++ b/cveuc/cveuc.vcxproj @@ -81,6 +81,7 @@ + @@ -90,6 +91,7 @@ Create Create + diff --git a/cveuc/cveuc.vcxproj.filters b/cveuc/cveuc.vcxproj.filters index a5098fc..f5d8a9b 100644 --- a/cveuc/cveuc.vcxproj.filters +++ b/cveuc/cveuc.vcxproj.filters @@ -24,6 +24,9 @@ Header Files + + Header Files + @@ -38,6 +41,9 @@ Source Files + + Source Files + diff --git a/cveuc/eucjis2004.cpp b/cveuc/eucjis2004.cpp index c1c1860..ee459c4 100644 --- a/cveuc/eucjis2004.cpp +++ b/cveuc/eucjis2004.cpp @@ -174,11 +174,17 @@ BOOL EucJis2004ToWideChar(LPCSTR src, size_t *srcsize, LPWSTR dst, size_t *dstsi WCHAR utf16[2][2]; size_t utf16num[2]; - if(src == NULL || dstsize == NULL) + if(dstsize == NULL) { return FALSE; } + if(src == NULL) + { + *dstsize = 0; + return FALSE; + } + if(srcsize != NULL) { ss = *srcsize; @@ -271,8 +277,14 @@ BOOL WideCharToEucJis2004(LPCWSTR src, size_t *srcsize, LPSTR dst, size_t *dstsi UCSCHAR ucp; BOOL exist; - if(src == NULL || dstsize == NULL) + if(dstsize == NULL) + { + return FALSE; + } + + if(src == NULL) { + *dstsize = 0; return FALSE; } @@ -495,3 +507,49 @@ BOOL WideCharToEucJis2004(LPCWSTR src, size_t *srcsize, LPSTR dst, size_t *dstsi } return TRUE; } + +std::string wstring_to_eucjis2004_string(const std::wstring &s) +{ + std::string ret; + size_t len = -1; + + WideCharToEucJis2004(s.c_str(), NULL, NULL, &len); + if(len > 0) + { + try + { + LPSTR euc = new CHAR[len]; + WideCharToEucJis2004(s.c_str(), NULL, euc, &len); + ret = euc; + delete[] euc; + } + catch(...) + { + } + } + + return ret; +} + +std::wstring eucjis2004_string_to_wstring(const std::string &s) +{ + std::wstring ret; + size_t len = -1; + + EucJis2004ToWideChar(s.c_str(), NULL, NULL, &len); + if(len > 0) + { + try + { + LPWSTR wcs = new WCHAR[len]; + EucJis2004ToWideChar(s.c_str(), NULL, wcs, &len); + ret = wcs; + delete[] wcs; + } + catch(...) + { + } + } + + return ret; +} diff --git a/cveuc/eucjis2004.h b/cveuc/eucjis2004.h index 0c7f32c..1cabe50 100644 --- a/cveuc/eucjis2004.h +++ b/cveuc/eucjis2004.h @@ -58,4 +58,10 @@ BOOL EucJis2004ToWideChar(LPCSTR src, size_t *srcsize, LPWSTR dst, size_t *dstsi // NULLのとき戻り値はFALSEになる BOOL WideCharToEucJis2004(LPCWSTR src, size_t *srcsize, LPSTR dst, size_t *dstsize); +std::string wstring_to_eucjis2004_string(const std::wstring &s); +std::wstring eucjis2004_string_to_wstring(const std::string &s); + +#define WCTOEUC(w) wstring_to_eucjis2004_string(w).c_str() +#define EUCTOWC(u) eucjis2004_string_to_wstring(u).c_str() + #endif //EUCJIS2004_H diff --git a/cveuc/stdafx.h b/cveuc/stdafx.h index 06fdcef..c480a4e 100644 --- a/cveuc/stdafx.h +++ b/cveuc/stdafx.h @@ -4,4 +4,6 @@ #include #include +#include + #include diff --git a/cveuc/utf8.cpp b/cveuc/utf8.cpp new file mode 100644 index 0000000..08063cb --- /dev/null +++ b/cveuc/utf8.cpp @@ -0,0 +1,48 @@ + +std::string wstring_to_utf8_string(const std::wstring &s) +{ + std::string ret; + + int len = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL); + if(len > 0) + { + try + { + LPSTR utf8 = new CHAR[len]; + if(WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, utf8, len, NULL, NULL) > 0) + { + ret = utf8; + } + delete[] utf8; + } + catch(...) + { + } + } + + return ret; +} + +std::wstring utf8_string_to_wstring(const std::string &s) +{ + std::wstring ret; + + int len = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0); + if(len > 0) + { + try + { + LPWSTR wcs = new WCHAR[len]; + if(MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wcs, len) > 0) + { + ret = wcs; + } + delete[] wcs; + } + catch(...) + { + } + } + + return ret; +} diff --git a/cveuc/utf8.h b/cveuc/utf8.h new file mode 100644 index 0000000..f7054cd --- /dev/null +++ b/cveuc/utf8.h @@ -0,0 +1,11 @@ + +#ifndef UTF8_H +#define UTF8_H + +std::string wstring_to_utf8_string(const std::wstring &s); +std::wstring utf8_string_to_wstring(const std::string &s); + +#define WCTOU8(w) wstring_to_utf8_string(w).c_str() +#define U8TOWC(u) utf8_string_to_wstring(u).c_str() + +#endif