From 73f788a41c6de45bf19e8b64ec2bbf59818bff37 Mon Sep 17 00:00:00 2001 From: Dana Robinson Date: Sun, 17 Mar 2024 22:53:19 -0700 Subject: [PATCH 1/3] Address code page issues w/ Windows file paths On Windows, HDF5 attempted to convert file paths passed to open() and remove() to UTF-16 in order to handle Unicode file paths. This scheme does not work when the system uses code pages to handle non-ASCII file names. As suggested in the forum post below, we now first try the plain open()/remove() call, which should handle systems where non-ASCII code pages are in use, and fall back to the UTF-16 path only if that call fails with ENOENT. https://forum.hdfgroup.org/t/open-create-hdf5-files-with-non-utf8-chars-such-as-shift-jis/11785 --- src/H5system.c | 62 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/src/H5system.c b/src/H5system.c index be886ae52f3..406166a19c0 100644 --- a/src/H5system.c +++ b/src/H5system.c @@ -516,13 +516,10 @@ H5_get_utf16_str(const char *s) /*------------------------------------------------------------------------- * Function: Wopen_utf8 * - * Purpose: UTF-8 equivalent of open(2) for use on Windows. - * Converts a UTF-8 input path to UTF-16 and then opens the - * file via _wopen() under the hood + * Purpose: UTF-8 equivalent of open(2) for use on Windows * * Return: Success: A POSIX file descriptor * Failure: -1 - * *------------------------------------------------------------------------- */ int @@ -532,10 +529,6 @@ Wopen_utf8(const char *path, int oflag, ...) wchar_t *wpath = NULL; /* UTF-16 version of the path */ int pmode = 0; /* mode (optionally set via variable args) */ - /* Convert the input UTF-8 path to UTF-16 */ - if (NULL == (wpath = H5_get_utf16_str(path))) - goto done; - /* _O_BINARY must be set in Windows to avoid CR-LF <-> LF EOL * transformations when performing I/O. Note that this will * produce Unix-style text files, though. @@ -551,12 +544,33 @@ Wopen_utf8(const char *path, int oflag, ...) 
va_end(vl); } - /* Open the file */ + /* First try opening the file with the normal POSIX open() call. + * This will handle ASCII without additional processing as well as + * systems where code pages are being used instead of true Unicode. + */ + if ((fd = open(path, oflag, pmode)) >= 0) { + /* If this succeeds, we're done */ + goto done; + } + + if (errno == ENOENT) { + /* Not found, reset errno and try with UTF-16 */ + errno = 0; + } + else { + /* Some other error (like permissions), so just exit */ + goto done; + } + + /* Convert the input UTF-8 path to UTF-16 */ + if (NULL == (wpath = H5_get_utf16_str(path))) + goto done; + + /* Open the file using a UTF-16 path */ fd = _wopen(wpath, oflag, pmode); done: - if (wpath) - H5MM_xfree((void *)wpath); + H5MM_xfree(wpath); return fd; } /* end Wopen_utf8() */ @@ -565,12 +579,9 @@ Wopen_utf8(const char *path, int oflag, ...) * Function: Wremove_utf8 * * Purpose: UTF-8 equivalent of remove(3) for use on Windows. - * Converts a UTF-8 input path to UTF-16 and then opens the - * file via _wremove() under the hood * * Return: Success: 0 * Failure: -1 - * *------------------------------------------------------------------------- */ int @@ -579,16 +590,33 @@ Wremove_utf8(const char *path) wchar_t *wpath = NULL; /* UTF-16 version of the path */ int ret = -1; + /* First try opening the file with the normal POSIX open() call. + * This will handle ASCII without additional processing as well as + * systems where code pages are being used instead of true Unicode. 
+ */ + if ((ret = remove(path)) >= 0) { + /* If this succeeds, we're done */ + goto done; + } + + if (errno == ENOENT) { + /* Not found, reset errno and try with UTF-16 */ + errno = 0; + } + else { + /* Some other error (like permissions), so just exit */ + goto done; + } + /* Convert the input UTF-8 path to UTF-16 */ if (NULL == (wpath = H5_get_utf16_str(path))) goto done; - /* Open the file */ + /* Remove the file using a UTF-16 path */ ret = _wremove(wpath); done: - if (wpath) - H5MM_xfree((void *)wpath); + H5MM_xfree(wpath); return ret; } /* end Wremove_utf8() */ From 6bb220be117face75cf1e95ab91a4d49acec1c5f Mon Sep 17 00:00:00 2001 From: Dana Robinson Date: Mon, 18 Mar 2024 00:18:11 -0700 Subject: [PATCH 2/3] Rename Windows open/remove functions --- src/H5system.c | 18 ++++++++++-------- src/H5win32defs.h | 12 ++++++------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/H5system.c b/src/H5system.c index 406166a19c0..8f5b6f451d1 100644 --- a/src/H5system.c +++ b/src/H5system.c @@ -514,16 +514,17 @@ H5_get_utf16_str(const char *s) } /* end H5_get_utf16_str() */ /*------------------------------------------------------------------------- - * Function: Wopen_utf8 + * Function: Wopen * - * Purpose: UTF-8 equivalent of open(2) for use on Windows + * Purpose: Equivalent of open(2) for use on Windows. Necessary to + * handle code pages and Unicode on that platform. * * Return: Success: A POSIX file descriptor * Failure: -1 *------------------------------------------------------------------------- */ int -Wopen_utf8(const char *path, int oflag, ...) +Wopen(const char *path, int oflag, ...) { int fd = -1; /* POSIX file descriptor to be returned */ wchar_t *wpath = NULL; /* UTF-16 version of the path */ @@ -573,19 +574,20 @@ Wopen_utf8(const char *path, int oflag, ...) 
H5MM_xfree(wpath); return fd; -} /* end Wopen_utf8() */ +} /* end Wopen() */ /*------------------------------------------------------------------------- - * Function: Wremove_utf8 + * Function: Wremove * - * Purpose: UTF-8 equivalent of remove(3) for use on Windows. + * Purpose: Equivalent of remove(3) for use on Windows. Necessary to + * handle code pages and Unicode on that platform. * * Return: Success: 0 * Failure: -1 *------------------------------------------------------------------------- */ int -Wremove_utf8(const char *path) +Wremove(const char *path) { wchar_t *wpath = NULL; /* UTF-16 version of the path */ int ret = -1; @@ -619,7 +621,7 @@ Wremove_utf8(const char *path) H5MM_xfree(wpath); return ret; -} /* end Wremove_utf8() */ +} /* end Wremove() */ #endif /* H5_HAVE_WIN32_API */ diff --git a/src/H5win32defs.h b/src/H5win32defs.h index 9630c5e2d42..05d291ec03b 100644 --- a/src/H5win32defs.h +++ b/src/H5win32defs.h @@ -39,7 +39,7 @@ struct timezone { }; #endif -#define HDcreat(S, M) Wopen_utf8(S, O_CREAT | O_TRUNC | O_RDWR, M) +#define HDcreat(S, M) Wopen(S, O_CREAT | O_TRUNC | O_RDWR, M) #define HDflock(F, L) Wflock(F, L) #define HDfstat(F, B) _fstati64(F, B) #define HDftell(F) _ftelli64(F) @@ -59,13 +59,13 @@ struct timezone { */ #if (defined(_MSC_VER) && !defined(_MSVC_TRADITIONAL)) || _MSVC_TRADITIONAL /* Using the MSVC traditional preprocessor */ -#define HDopen(S, F, ...) Wopen_utf8(S, F, __VA_ARGS__) +#define HDopen(S, F, ...) Wopen(S, F, __VA_ARGS__) #else /* Using a standards conformant preprocessor */ -#define HDopen(S, F, ...) Wopen_utf8(S, F, ##__VA_ARGS__) +#define HDopen(S, F, ...) Wopen(S, F, ##__VA_ARGS__) #endif -#define HDremove(S) Wremove_utf8(S) +#define HDremove(S) Wremove(S) #define HDsetenv(N, V, O) Wsetenv(N, V, O) #define HDsetvbuf(F, S, M, Z) setvbuf(F, S, M, (Z > 1 ? 
Z : 2)) #define HDsleep(S) Sleep(S * 1000) @@ -89,8 +89,8 @@ H5_DLL int Wsetenv(const char *name, const char *value, int overwrite); H5_DLL int Wflock(int fd, int operation); H5_DLL herr_t H5_expand_windows_env_vars(char **env_var); H5_DLL wchar_t *H5_get_utf16_str(const char *s); -H5_DLL int Wopen_utf8(const char *path, int oflag, ...); -H5_DLL int Wremove_utf8(const char *path); +H5_DLL int Wopen(const char *path, int oflag, ...); +H5_DLL int Wremove(const char *path); H5_DLL int H5_get_win32_times(H5_timevals_t *tvs); H5_DLL char *H5_strndup(const char *s, size_t n); H5_DLL char *Wstrcasestr_wrap(const char *haystack, const char *needle); From 38b345686396ee514cdf0cb182767c61582bbf63 Mon Sep 17 00:00:00 2001 From: Dana Robinson Date: Mon, 18 Mar 2024 00:21:38 -0700 Subject: [PATCH 3/3] open --> remove in Wremove() comments --- src/H5system.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/H5system.c b/src/H5system.c index 8f5b6f451d1..6057c6ff639 100644 --- a/src/H5system.c +++ b/src/H5system.c @@ -592,7 +592,7 @@ Wremove(const char *path) wchar_t *wpath = NULL; /* UTF-16 version of the path */ int ret = -1; - /* First try opening the file with the normal POSIX open() call. + /* First try removing the file with the normal POSIX remove() call. * This will handle ASCII without additional processing as well as * systems where code pages are being used instead of true Unicode. */