From c7b58e3fe777a1107e895ca2b0a5f3f75ab16db8 Mon Sep 17 00:00:00 2001
From: nathancorvussolis <nathancorvussolis@gmail.com>
Date: Sat, 2 Aug 2014 19:19:51 +0900
Subject: [PATCH] 2.0.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

・約16KBを超過する行がファイルにあるとき文字コード変換に失敗することがあるバグを修正しました。

・文字コード指定のオプションを変更しました。
　・cveuc.exe [-euwEUW] <入力ファイル> <出力ファイル>
　　・入力ファイル文字コード指定
　　　・-e : EUC-JIS-2004 (改行LF) (デフォルト)
　　　・-u : UTF-8 (BOMなしまたはBOMあり、改行CR+LFまたはLF)
　　　・-w : UTF-16 (LE、BOMなしまたはBOMあり、改行CR+LFまたはLF)
　　・出力ファイル文字コード指定
　　　・-E : EUC-JIS-2004 (改行LF)
　　　・-U : UTF-8 (BOMなし、改行LF)
　　　・-W : UTF-16 (LE、BOMあり、改行CR+LF) (デフォルト)
---
 README.TXT                  |  65 ++++++++----
 cveuc/cveuc.cpp             | 205 ++++++++++++++++++++----------------
 cveuc/cveuc.vcxproj         |   2 +
 cveuc/cveuc.vcxproj.filters |   6 ++
 cveuc/eucjis2004.cpp        |  62 ++++++++++-
 cveuc/eucjis2004.h          |   6 ++
 cveuc/stdafx.h              |   2 +
 cveuc/utf8.cpp              |  48 +++++++++
 cveuc/utf8.h                |  11 ++
 9 files changed, 293 insertions(+), 114 deletions(-)
 create mode 100644 cveuc/utf8.cpp
 create mode 100644 cveuc/utf8.h
diff --git a/README.TXT b/README.TXT
index da29772..8116d95 100644
--- a/README.TXT
+++ b/README.TXT
@@ -1,55 +1,78 @@
 ﻿
 -------------------------------------------------------------------------------
 
-   cveuc 1.2.0
+   cveuc 2.0.0
 
 -------------------------------------------------------------------------------
 
 文字コード変換コマンドラインプログラムです。
-EUC-JIS-2004(改行LF)、UTF-16(LE、BOMあり、改行CR+LF)、UTF-8(BOMなし、改行LF) を
-相互に変換します。
+EUC-JIS-2004、UTF-16、UTF-8 を相互に変換します。
 
 変換テーブルは cygwin の iconv 1.14 と同等としています。
 
 実行方法
 
-   cveuc.exe [-uUW] <input file> <output file>
+   cveuc.exe [-euwEUW] <input file> <output file>
 
 オプション
 
-   -u : 入力ファイル UTF-8
-   -U : 出力ファイル UTF-8
-   -W : 出力ファイル UTF-16
+   入力ファイル文字コード指定
+
+      -e : EUC-JIS-2004 (改行LF) (デフォルト)
+      -u : UTF-8 (BOMなしまたはBOMあり、改行CR+LFまたはLF)
+      -w : UTF-16 (LE、BOMなしまたはBOMあり、改行CR+LFまたはLF)
+
+   出力ファイル文字コード指定
+
+      -E : EUC-JIS-2004 (改行LF)
+      -U : UTF-8 (BOMなし、改行LF)
+      -W : UTF-16 (LE、BOMあり、改行CR+LF) (デフォルト)
 
 使用例
 
- 1) EUC-JIS-2004 → UTF-16
-   cveuc.exe euc.txt utf16.txt
+ 1) EUC-JIS-2004 → EUC-JIS-2004
 
- 2) EUC-JIS-2004 → UTF-16
-   cveuc.exe -W euc.txt utf16.txt
+   cveuc.exe -E euc.txt euc_2.txt
+   cveuc.exe -e -E euc.txt euc_2.txt
+
+ 2) EUC-JIS-2004 → UTF-8
 
- 3) EUC-JIS-2004 → UTF-8
    cveuc.exe -U euc.txt utf8.txt
+   cveuc.exe -e -U euc.txt utf8.txt
+
+ 3) EUC-JIS-2004 → UTF-16
+
+   cveuc.exe euc.txt utf16.txt
+   cveuc.exe -e euc.txt utf16.txt
+   cveuc.exe -W euc.txt utf16.txt
+   cveuc.exe -e -W euc.txt utf16.txt
 
  4) UTF-16 → EUC-JIS-2004
-   cveuc.exe utf16.txt euc.txt
+
+   cveuc.exe -w -E utf16.txt euc.txt
 
  5) UTF-16 → UTF-8
-   cveuc.exe -U utf16.txt utf8.txt
 
- 6) UTF-8 → EUC-JIS-2004
-   cveuc.exe -u utf8.txt euc.txt
+   cveuc.exe -w -U utf16.txt utf8.txt
 
- 7) UTF-8 → UTF-16
-   cveuc.exe -u -W utf8.txt utf16.txt
+ 6) UTF-16 → UTF-16
+
+   cveuc.exe -w utf16.txt utf16_2.txt
+   cveuc.exe -w -W utf16.txt utf16_2.txt
+
+ 7) UTF-8 → EUC-JIS-2004
 
- 8) UTF-16 → UTF-16 (単なるコピーと同じ)
-   cveuc.exe -W utf16.txt utf16_2.txt
+   cveuc.exe -u -E utf8.txt euc.txt
+
+ 8) UTF-8 → UTF-8
 
- 9) UTF-8 → UTF-8 (単なるコピーと同じ)
    cveuc.exe -u -U utf8.txt utf8_2.txt
 
+ 9) UTF-8 → UTF-16
+
+   cveuc.exe -u utf8.txt utf16.txt
+   cveuc.exe -u -W utf8.txt utf16.txt
+
 
 
 -------------------------------------------------------------------------------
diff --git a/cveuc/cveuc.cpp b/cveuc/cveuc.cpp
index 54f7796..66b1480 100644
--- a/cveuc/cveuc.cpp
+++ b/cveuc/cveuc.cpp
@@ -1,16 +1,23 @@
 ﻿
 #include "eucjis2004.h"
+#include "utf8.h"
 
-#define VERSION		L"1.2.0"
+#define VERSION			L"2.0.0"
 
-#define BUFSIZE 0x8000
+#define BUFSIZE			0x800
 
-#define RccsUTF8 L"r,ccs=UTF-8"
-#define WccsUTF8 L"w,ccs=UTF-8"
-#define WccsUTF16LE L"w,ccs=UTF-16LE"
-#define RB L"rb"
-#define WB L"wb"
-#define BOM 0xFEFF
+#define RccsUTF8		L"r,ccs=UTF-8"
+#define WccsUTF8		L"w,ccs=UTF-8"
+#define RccsUTF16LE		L"r,ccs=UTF-16LE"
+#define WccsUTF16LE		L"w,ccs=UTF-16LE"
+#define RB				L"rb"
+#define WB				L"wb"
+
+enum enum_inenc {	// input encoding chosen by the command-line options
+	in_euc,		// default (also what -e leaves in place)
+	in_utf16,	// selected by -w
+	in_utf8		// selected by -u
+};
 
 enum enum_outenc {
 	out_euc,
@@ -21,27 +28,33 @@ enum enum_outenc {
 void print_usage(void)
 {
 	fwprintf(stderr, L"\ncveuc %s\n\n", VERSION);
-	fwprintf(stderr, L"usage : cveuc [option] <input file> <output file>\n");
-	fwprintf(stderr, L"   option :\n");
-	fwprintf(stderr, L"      -u   input file in UTF-8\n");
-	fwprintf(stderr, L"      -U   output file in UTF-8 (LF, without BOM)\n");
-	fwprintf(stderr, L"      -W   output file in UTF-16 (CR+LF, LE with BOM)\n");
-	fwprintf(stderr, L"\ndefault file encodings : EUC-JIS-2004 and UTF-16(CR+LF, LE with BOM)\n");
+	fwprintf(stderr, L"usage : cveuc [option] <input file> <output file>\n"	// one call: the adjacent literals below are concatenated into a single format string
+		L"   option :\n"
+		L"      input file encoding :\n"
+		L"         -e   EUC-JIS-2004 (LF) (default)\n"
+		L"         -u   UTF-8 (with or without BOM, LF or CR+LF)\n"
+		L"         -w   UTF-16 (LE, with or without BOM, LF or CR+LF)\n"
+		L"      output file encoding :\n"
+		L"         -E   EUC-JIS-2004 (LF)\n"
+		L"         -U   UTF-8 (without BOM, LF)\n"
+		L"         -W   UTF-16 (LE, with BOM, CR+LF) (default)\n"
+		);
 }
 
 int wmain(int argc, wchar_t* argv[])
 {
 	FILE *fpi, *fpo;
-	WCHAR bom = L'\0';
-	CHAR buf[BUFSIZE*2];
+	CHAR buf[BUFSIZE * sizeof(WCHAR)];
+	LPSTR pb;
 	WCHAR wbuf[BUFSIZE];
+	LPWSTR pwb;
+	std::string sbuf;
+	std::wstring wsbuf;
 	size_t ds;
 	BOOL ret;
 	UINT line;
-	int ai;
-	LPCWSTR rflag = RB;
-	LPCWSTR wflag = WB;
-	int outenc = out_euc;
+	LPCWSTR rflag = RB, wflag = WccsUTF16LE;
+	int ai, inenc = in_euc, outenc = out_utf16;
 	LPCWSTR infile, outfile;
 
 	_wsetlocale(LC_ALL, L"JPN");
@@ -52,21 +65,33 @@ int wmain(int argc, wchar_t* argv[])
 		return -1;
 	}
 
-	for(ai=1; ai<3; ai++)
+	for(ai = 1; ai < 3; ai++)
 	{
-		if(wcscmp(argv[ai], L"-u") == 0)
+		if(wcscmp(argv[ai], L"-e") == 0)
+		{
+		}
+		else if(wcscmp(argv[ai], L"-u") == 0)
 		{
-			bom = BOM;
+			inenc = in_utf8;
 			rflag = RccsUTF8;
 		}
+		else if(wcscmp(argv[ai], L"-w") == 0)
+		{
+			inenc = in_utf16;
+			rflag = RccsUTF16LE;
+		}
+		else if(wcscmp(argv[ai], L"-E") == 0)
+		{
+			outenc = out_euc;
+			wflag = WB;
+		}
 		else if(wcscmp(argv[ai], L"-U") == 0)
 		{
 			outenc = out_utf8;
+			wflag = WB;
 		}
 		else if(wcscmp(argv[ai], L"-W") == 0)
 		{
-			outenc = out_utf16;
-			wflag = WccsUTF16LE;
 		}
 		else
 		{
@@ -86,31 +111,7 @@ int wmain(int argc, wchar_t* argv[])
 	}
 
 	infile = argv[ai];
-	outfile = argv[ai+1];
-
-	if(bom == L'\0')
-	{
-		_wfopen_s(&fpi, infile, RB);
-		if(fpi == NULL)
-		{
-			fwprintf(stderr, L"error : cannot open %s.\n", infile);
-			return -1;
-		}
-		fread(&bom, 2, 1, fpi);
-		fclose(fpi);
-		if(bom == BOM)
-		{
-			rflag = RccsUTF8;
-		}
-		else
-		{
-			if(outenc == out_euc)
-			{
-				outenc = out_utf16;
-				wflag = WccsUTF16LE;
-			}
-		}
-	}
+	outfile = argv[ai + 1];
 
 	_wfopen_s(&fpi, infile, rflag);
 	if(fpi == NULL)
@@ -126,35 +127,47 @@ int wmain(int argc, wchar_t* argv[])
 		return -1;
 	}
 
-	switch(bom)
+	if(inenc == in_utf8 || inenc == in_utf16)
 	{
-	case BOM:
-		line = 1;
-		while(fgetws(wbuf, _countof(wbuf), fpi) != NULL)
+		for(line = 1; ; line++)
 		{
+			sbuf.clear();
+			wsbuf.clear();
+
+			// Gather fgetws() chunks until one whole line (ending in L'\n') is assembled.
+			while((pwb = fgetws(wbuf, _countof(wbuf), fpi)) != NULL)
+			{
+				wsbuf.append(wbuf);
+				if(!wsbuf.empty() && wsbuf.back() == L'\n')
+				{
+					break;
+				}
+			}
+			// Stop only when nothing was read; a final line without L'\n' must still be converted.
+			if(pwb == NULL && wsbuf.empty())
+			{
+				break;
+			}
+
+			ret = TRUE;
+
 			switch(outenc)
 			{
 			case out_euc:
-				ds = _countof(buf);
-				ret = WideCharToEucJis2004(wbuf, NULL, buf, &ds);
+				ds = -1;
+				ret = WideCharToEucJis2004(wsbuf.c_str(), NULL, NULL, &ds);
 				if(ds > 0)
 				{
-					fwrite(buf, ds - 1, 1, fpo);
+					sbuf = wstring_to_eucjis2004_string(wsbuf);
+					fwrite(sbuf.c_str(), sbuf.size(), 1, fpo);
 				}
 				break;
 			case out_utf16:
-				fwprintf(fpo, L"%s", wbuf);
+				fwprintf(fpo, L"%s", wsbuf.c_str());
 				break;
 			case out_utf8:
-				ds = WideCharToMultiByte(CP_UTF8, 0, wbuf, -1, buf, sizeof(buf), NULL, NULL);
-				if(ds > 0)
-				{
-					fwrite(buf, ds - 1, 1, fpo);
-				}
-				else
-				{
-					ret = FALSE;
-				}
+				sbuf = wstring_to_utf8_string(wsbuf);
+				fwrite(sbuf.c_str(), sbuf.size(), 1, fpo);
 				break;
 			default:
 				break;
@@ -165,45 +178,57 @@ int wmain(int argc, wchar_t* argv[])
 				fwprintf(stderr, L"error : cannot convert line %u\n", line);
 				break;
 			}
-			line++;
 		}
 
 		fclose(fpi);
 		fclose(fpo);
-		break;
-
-	default:
-		line = 1;
-		while(fgets(buf, _countof(buf), fpi) != NULL)
+	}
+	else
+	{
+		for(line = 1; ; line++)
 		{
+			sbuf.clear();
+			wsbuf.clear();
+
+			// Gather fgets() chunks until one whole line (ending in '\n') is assembled.
+			while((pb = fgets(buf, _countof(buf), fpi)) != NULL)
+			{
+				sbuf.append(buf);
+				if(!sbuf.empty() && sbuf.back() == '\n')
+				{
+					break;
+				}
+			}
+			// Stop only when nothing was read; a final line without '\n' must still be converted.
+			if(pb == NULL && sbuf.empty())
+			{
+				break;
+			}
+
+			ret = TRUE;
+
 			switch(outenc)
 			{
 			case out_euc:
-				ds = strlen(buf);
-				fwrite(buf, ds, 1, fpo);
+				fwrite(sbuf.c_str(), sbuf.size(), 1, fpo);
 				break;
 			case out_utf16:
-				ds = _countof(wbuf);
-				ret = EucJis2004ToWideChar(buf, NULL, wbuf, &ds);
+				ds = -1;
+				ret = EucJis2004ToWideChar(sbuf.c_str(), NULL, NULL, &ds);
 				if(ds > 0)
 				{
-					fwprintf(fpo, L"%s", wbuf);
+					wsbuf = eucjis2004_string_to_wstring(sbuf);
+					fwprintf(fpo, L"%s", wsbuf.c_str());
 				}
 				break;
 			case out_utf8:
-				ds = _countof(wbuf);
-				ret = EucJis2004ToWideChar(buf, NULL, wbuf, &ds);
+				ds = -1;
+				ret = EucJis2004ToWideChar(sbuf.c_str(), NULL, NULL, &ds);
 				if(ds > 0)
 				{
-					ds = WideCharToMultiByte(CP_UTF8, 0, wbuf, -1, buf, sizeof(buf), NULL, NULL);
-					if(ds > 0)
-					{
-						fwrite(buf, ds - 1, 1, fpo);
-					}
-					else
-					{
-						ret = FALSE;
-					}
+					wsbuf = eucjis2004_string_to_wstring(sbuf);
+					sbuf = wstring_to_utf8_string(wsbuf);
+					fwrite(sbuf.c_str(), sbuf.size(), 1, fpo);
 				}
 				break;
 			default:
@@ -215,12 +240,10 @@ int wmain(int argc, wchar_t* argv[])
 				fwprintf(stderr, L"error : cannot convert line %u\n", line);
 				break;
 			}
-			line++;
 		}
 
 		fclose(fpi);
 		fclose(fpo);
-		break;
 	}
 
 	return 0;
diff --git a/cveuc/cveuc.vcxproj b/cveuc/cveuc.vcxproj
index ccb8ed1..0ba9e51 100644
--- a/cveuc/cveuc.vcxproj
+++ b/cveuc/cveuc.vcxproj
@@ -81,6 +81,7 @@
     <ClInclude Include="eucjis2004.h" />
     <ClInclude Include="eucjis2004table.h" />
     <ClInclude Include="stdafx.h" />
+    <ClInclude Include="utf8.h" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="cveuc.cpp" />
@@ -90,6 +91,7 @@
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
     </ClCompile>
+    <ClCompile Include="utf8.cpp" />
   </ItemGroup>
   <ItemGroup>
     <Text Include="..\README.TXT" />
diff --git a/cveuc/cveuc.vcxproj.filters b/cveuc/cveuc.vcxproj.filters
index a5098fc..f5d8a9b 100644
--- a/cveuc/cveuc.vcxproj.filters
+++ b/cveuc/cveuc.vcxproj.filters
@@ -24,6 +24,9 @@
     <ClInclude Include="eucjis2004table.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="utf8.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="stdafx.cpp">
@@ -38,6 +41,9 @@
     <ClCompile Include="eucjis2004table.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="utf8.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <Text Include="..\README.TXT" />
diff --git a/cveuc/eucjis2004.cpp b/cveuc/eucjis2004.cpp
index c1c1860..ee459c4 100644
--- a/cveuc/eucjis2004.cpp
+++ b/cveuc/eucjis2004.cpp
@@ -174,11 +174,17 @@ BOOL EucJis2004ToWideChar(LPCSTR src, size_t *srcsize, LPWSTR dst, size_t *dstsi
 	WCHAR utf16[2][2];
 	size_t utf16num[2];
 
-	if(src == NULL || dstsize == NULL)
+	if(dstsize == NULL)
 	{
 		return FALSE;
 	}
 
+	if(src == NULL)
+	{
+		*dstsize = 0;
+		return FALSE;
+	}
+
 	if(srcsize != NULL)
 	{
 		ss = *srcsize;
@@ -271,8 +277,14 @@ BOOL WideCharToEucJis2004(LPCWSTR src, size_t *srcsize, LPSTR dst, size_t *dstsi
 	UCSCHAR ucp;
 	BOOL exist;
 
-	if(src == NULL || dstsize == NULL)
+	if(dstsize == NULL)
+	{
+		return FALSE;
+	}
+
+	if(src == NULL)
 	{
+		*dstsize = 0;
 		return FALSE;
 	}
 
@@ -495,3 +507,49 @@ BOOL WideCharToEucJis2004(LPCWSTR src, size_t *srcsize, LPSTR dst, size_t *dstsi
 	}
 	return TRUE;
 }
+
+std::string wstring_to_eucjis2004_string(const std::wstring &s)
+{
+	std::string ret;
+	size_t len = -1;
+
+	// Sizing pass: called with a NULL destination; the resulting len is used as the buffer size.
+	WideCharToEucJis2004(s.c_str(), NULL, NULL, &len);
+	if(len > 0)
+	{
+		try
+		{
+			std::string euc(len, '\0');	// owned, zero-filled buffer: released even if the copy below throws
+			WideCharToEucJis2004(s.c_str(), NULL, &euc[0], &len);
+			ret = euc.c_str();	// keep the text up to the first NUL, as the C-string assignment did
+		}
+		catch(...)	// allocation failure: fall through and return whatever ret holds (empty)
+		{
+		}
+	}
+
+	return ret;
+}
+
+std::wstring eucjis2004_string_to_wstring(const std::string &s)
+{
+	std::wstring ret;
+	size_t len = -1;
+
+	// Sizing pass: called with a NULL destination; the resulting len is used as the buffer size.
+	EucJis2004ToWideChar(s.c_str(), NULL, NULL, &len);
+	if(len > 0)
+	{
+		try
+		{
+			std::wstring wcs(len, L'\0');	// owned, zero-filled buffer: released even if the copy below throws
+			EucJis2004ToWideChar(s.c_str(), NULL, &wcs[0], &len);
+			ret = wcs.c_str();	// keep the text up to the first NUL, as the C-string assignment did
+		}
+		catch(...)	// allocation failure: fall through and return whatever ret holds (empty)
+		{
+		}
+	}
+
+	return ret;
+}
diff --git a/cveuc/eucjis2004.h b/cveuc/eucjis2004.h
index 0c7f32c..1cabe50 100644
--- a/cveuc/eucjis2004.h
+++ b/cveuc/eucjis2004.h
@@ -58,4 +58,10 @@ BOOL EucJis2004ToWideChar(LPCSTR src, size_t *srcsize, LPWSTR dst, size_t *dstsi
 //                NULLのとき戻り値はFALSEになる
 BOOL WideCharToEucJis2004(LPCWSTR src, size_t *srcsize, LPSTR dst, size_t *dstsize);
 
+std::string wstring_to_eucjis2004_string(const std::wstring &s);
+std::wstring eucjis2004_string_to_wstring(const std::string &s);
+// The macros below yield a pointer into a temporary string: use it only within the same full expression.
+#define WCTOEUC(w) wstring_to_eucjis2004_string(w).c_str()
+#define EUCTOWC(u) eucjis2004_string_to_wstring(u).c_str()
+
 #endif //EUCJIS2004_H
diff --git a/cveuc/stdafx.h b/cveuc/stdafx.h
index 06fdcef..c480a4e 100644
--- a/cveuc/stdafx.h
+++ b/cveuc/stdafx.h
@@ -4,4 +4,6 @@
 #include <stdio.h>
 #include <locale.h>
 
+#include <string>
+
 #include <Windows.h>
diff --git a/cveuc/utf8.cpp b/cveuc/utf8.cpp
new file mode 100644
index 0000000..08063cb
--- /dev/null
+++ b/cveuc/utf8.cpp
@@ -0,0 +1,48 @@
+﻿
+std::string wstring_to_utf8_string(const std::wstring &s)
+{
+	std::string ret;
+
+	// Sizing call: with a zero-length destination the API reports the bytes needed (terminator included, since the source length is -1).
+	int len = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL);
+	if(len > 0)
+	{
+		try
+		{
+			std::string utf8(len, '\0');	// owned buffer: released even if the copy below throws
+			if(WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, &utf8[0], len, NULL, NULL) > 0)
+			{
+				ret = utf8.c_str();	// keep the text up to the first NUL, as the C-string assignment did
+			}
+		}
+		catch(...)	// allocation failure: fall through and return whatever ret holds (empty)
+		{
+		}
+	}
+
+	return ret;
+}
+
+std::wstring utf8_string_to_wstring(const std::string &s)
+{
+	std::wstring ret;
+
+	// Sizing call: with a zero-length destination the API reports the characters needed (terminator included, since the source length is -1).
+	int len = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
+	if(len > 0)
+	{
+		try
+		{
+			std::wstring wcs(len, L'\0');	// owned buffer: released even if the copy below throws
+			if(MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, &wcs[0], len) > 0)
+			{
+				ret = wcs.c_str();	// keep the text up to the first NUL, as the C-string assignment did
+			}
+		}
+		catch(...)	// allocation failure: fall through and return whatever ret holds (empty)
+		{
+		}
+	}
+
+	return ret;
+}
diff --git a/cveuc/utf8.h b/cveuc/utf8.h
new file mode 100644
index 0000000..f7054cd
--- /dev/null
+++ b/cveuc/utf8.h
@@ -0,0 +1,11 @@
+﻿
+#ifndef UTF8_H
+#define UTF8_H
+// Conversions between std::wstring and UTF-8 encoded std::string.
+std::string wstring_to_utf8_string(const std::wstring &s);
+std::wstring utf8_string_to_wstring(const std::string &s);
+// The macros below yield a pointer into a temporary string: use it only within the same full expression.
+#define WCTOU8(w) wstring_to_utf8_string(w).c_str()
+#define U8TOWC(u) utf8_string_to_wstring(u).c_str()
+
+#endif