Skip to content

Commit

Permalink
Address code page issues w/ Windows file paths (#4172)
Browse files Browse the repository at this point in the history
On Windows, HDF5 previously converted every file path passed to open() and
remove() from UTF-8 to UTF-16 in order to handle Unicode file paths. This
scheme does not work when the system uses code pages to handle non-ASCII
file names, because such paths are not UTF-8 encoded.

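As suggested in the forum post below, we now first try the plain open()
(or remove()) call, which handles ASCII paths as well as systems where
non-ASCII code pages are in use. Only if that call fails with ENOENT do
we fall back to converting the path to UTF-16 and calling _wopen()
(or _wremove()); any other error is returned to the caller as-is.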

https://forum.hdfgroup.org/t/open-create-hdf5-files-with-non-utf8-chars-such-as-shift-jis/11785
  • Loading branch information
derobins authored Mar 18, 2024
1 parent 840476e commit eb0351e
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 30 deletions.
78 changes: 54 additions & 24 deletions src/H5system.c
Original file line number Diff line number Diff line change
Expand Up @@ -514,28 +514,22 @@ H5_get_utf16_str(const char *s)
} /* end H5_get_utf16_str() */

/*-------------------------------------------------------------------------
* Function: Wopen_utf8
* Function: Wopen
*
* Purpose: UTF-8 equivalent of open(2) for use on Windows.
* Converts a UTF-8 input path to UTF-16 and then opens the
* file via _wopen() under the hood
* Purpose: Equivalent of open(2) for use on Windows. Necessary to
* handle code pages and Unicode on that platform.
*
* Return: Success: A POSIX file descriptor
* Failure: -1
*
*-------------------------------------------------------------------------
*/
int
Wopen_utf8(const char *path, int oflag, ...)
Wopen(const char *path, int oflag, ...)
{
int fd = -1; /* POSIX file descriptor to be returned */
wchar_t *wpath = NULL; /* UTF-16 version of the path */
int pmode = 0; /* mode (optionally set via variable args) */

/* Convert the input UTF-8 path to UTF-16 */
if (NULL == (wpath = H5_get_utf16_str(path)))
goto done;

/* _O_BINARY must be set in Windows to avoid CR-LF <-> LF EOL
* transformations when performing I/O. Note that this will
* produce Unix-style text files, though.
Expand All @@ -551,47 +545,83 @@ Wopen_utf8(const char *path, int oflag, ...)
va_end(vl);
}

/* Open the file */
/* First try opening the file with the normal POSIX open() call.
* This will handle ASCII without additional processing as well as
* systems where code pages are being used instead of true Unicode.
*/
if ((fd = open(path, oflag, pmode)) >= 0) {
/* If this succeeds, we're done */
goto done;
}

if (errno == ENOENT) {
/* Not found, reset errno and try with UTF-16 */
errno = 0;
}
else {
/* Some other error (like permissions), so just exit */
goto done;
}

/* Convert the input UTF-8 path to UTF-16 */
if (NULL == (wpath = H5_get_utf16_str(path)))
goto done;

/* Open the file using a UTF-16 path */
fd = _wopen(wpath, oflag, pmode);

done:
if (wpath)
H5MM_xfree((void *)wpath);
H5MM_xfree(wpath);

return fd;
} /* end Wopen_utf8() */
} /* end Wopen() */

/*-------------------------------------------------------------------------
* Function: Wremove_utf8
* Function: Wremove
*
* Purpose: UTF-8 equivalent of remove(3) for use on Windows.
* Converts a UTF-8 input path to UTF-16 and then opens the
* file via _wremove() under the hood
* Purpose: Equivalent of remove(3) for use on Windows. Necessary to
* handle code pages and Unicode on that platform.
*
* Return: Success: 0
* Failure: -1
*
*-------------------------------------------------------------------------
*/
int
Wremove_utf8(const char *path)
Wremove(const char *path)
{
wchar_t *wpath = NULL; /* UTF-16 version of the path */
int ret = -1;

/* First try removing the file with the normal POSIX remove() call.
* This will handle ASCII without additional processing as well as
* systems where code pages are being used instead of true Unicode.
*/
if ((ret = remove(path)) >= 0) {
/* If this succeeds, we're done */
goto done;
}

if (errno == ENOENT) {
/* Not found, reset errno and try with UTF-16 */
errno = 0;
}
else {
/* Some other error (like permissions), so just exit */
goto done;
}

/* Convert the input UTF-8 path to UTF-16 */
if (NULL == (wpath = H5_get_utf16_str(path)))
goto done;

/* Open the file */
/* Remove the file using a UTF-16 path */
ret = _wremove(wpath);

done:
if (wpath)
H5MM_xfree((void *)wpath);
H5MM_xfree(wpath);

return ret;
} /* end Wremove_utf8() */
} /* end Wremove() */

#endif /* H5_HAVE_WIN32_API */

Expand Down
12 changes: 6 additions & 6 deletions src/H5win32defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct timezone {
};
#endif

#define HDcreat(S, M) Wopen_utf8(S, O_CREAT | O_TRUNC | O_RDWR, M)
#define HDcreat(S, M) Wopen(S, O_CREAT | O_TRUNC | O_RDWR, M)
#define HDflock(F, L) Wflock(F, L)
#define HDfstat(F, B) _fstati64(F, B)
#define HDftell(F) _ftelli64(F)
Expand All @@ -59,13 +59,13 @@ struct timezone {
*/
#if (defined(_MSC_VER) && !defined(_MSVC_TRADITIONAL)) || _MSVC_TRADITIONAL
/* Using the MSVC traditional preprocessor */
#define HDopen(S, F, ...) Wopen_utf8(S, F, __VA_ARGS__)
#define HDopen(S, F, ...) Wopen(S, F, __VA_ARGS__)
#else
/* Using a standards conformant preprocessor */
#define HDopen(S, F, ...) Wopen_utf8(S, F, ##__VA_ARGS__)
#define HDopen(S, F, ...) Wopen(S, F, ##__VA_ARGS__)
#endif

#define HDremove(S) Wremove_utf8(S)
#define HDremove(S) Wremove(S)
#define HDsetenv(N, V, O) Wsetenv(N, V, O)
#define HDsetvbuf(F, S, M, Z) setvbuf(F, S, M, (Z > 1 ? Z : 2))
#define HDsleep(S) Sleep(S * 1000)
Expand All @@ -89,8 +89,8 @@ H5_DLL int Wsetenv(const char *name, const char *value, int overwrite);
H5_DLL int Wflock(int fd, int operation);
H5_DLL herr_t H5_expand_windows_env_vars(char **env_var);
H5_DLL wchar_t *H5_get_utf16_str(const char *s);
H5_DLL int Wopen_utf8(const char *path, int oflag, ...);
H5_DLL int Wremove_utf8(const char *path);
H5_DLL int Wopen(const char *path, int oflag, ...);
H5_DLL int Wremove(const char *path);
H5_DLL int H5_get_win32_times(H5_timevals_t *tvs);
H5_DLL char *H5_strndup(const char *s, size_t n);
H5_DLL char *Wstrcasestr_wrap(const char *haystack, const char *needle);
Expand Down

0 comments on commit eb0351e

Please sign in to comment.