Skip to content

Commit

Permalink
fix(windows): handle invalid wide chars in file names (fixes gh #241)
Browse files Browse the repository at this point in the history
For some reason, Windows allows invalid UTF-16 characters in file names.
Try to handle these gracefully when converting to UTF-8.
  • Loading branch information
mhx committed Oct 12, 2024
1 parent f9737b8 commit 0eb2ecb
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 12 deletions.
1 change: 1 addition & 0 deletions include/dwarfs/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ void utf8_sanitize(std::string& str);
void shorten_path_string(std::string& path, char separator, size_t max_len);

std::filesystem::path canonical_path(std::filesystem::path p);
std::string path_to_utf8_string_sanitized(std::filesystem::path const& p);

bool getenv_is_enabled(char const* var);

Expand Down
3 changes: 3 additions & 0 deletions include/dwarfs/writer/internal/entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ class entry : public entry_interface {
private:
std::u8string u8name() const;

#ifdef _WIN32
std::filesystem::path path_;
#endif
std::string name_;
std::weak_ptr<entry> parent_;
file_stat stat_;
Expand Down
20 changes: 20 additions & 0 deletions src/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <type_traits>

#if __has_include(<utf8cpp/utf8.h>)
#include <utf8cpp/utf8.h>
Expand Down Expand Up @@ -264,6 +265,25 @@ std::filesystem::path canonical_path(std::filesystem::path p) {
return p;
}

/**
 * Convert a filesystem path to a UTF-8 encoded std::string.
 *
 * On Windows builds where the native path character type is wchar_t, the
 * conversion goes through WideCharToMultiByte() with CP_UTF8 and no flags
 * instead of path::u8string(), so that file names containing invalid
 * UTF-16 do not make the conversion fail.
 * NOTE(review): per the Win32 documentation, invalid code units are
 * replaced with U+FFFD when WC_ERR_INVALID_CHARS is not set -- confirm for
 * the targeted Windows versions.
 *
 * On all other builds the path's u8string() is converted with
 * u8string_to_string().
 *
 * \param p  The path to convert.
 * \return   The UTF-8 representation of `p`; an empty string if `p` is
 *           empty or (Windows only) if the conversion fails.
 */
std::string path_to_utf8_string_sanitized(std::filesystem::path const& p) {
#ifdef _WIN32
  if constexpr (std::is_same_v<std::filesystem::path::value_type, wchar_t>) {
    auto const& in = p.native();
    if (in.empty()) {
      return {};
    }

    // WideCharToMultiByte() expresses all lengths as int.
    auto const in_len = static_cast<int>(in.size());

    // First pass: query the number of bytes required for the UTF-8 output.
    int const size_needed = ::WideCharToMultiByte(
        CP_UTF8, 0, in.data(), in_len, nullptr, 0, nullptr, nullptr);
    if (size_needed <= 0) {
      // The API reports failure by returning 0; there is nothing to convert.
      return {};
    }

    // Second pass: perform the actual conversion into the sized buffer.
    std::string out(static_cast<std::string::size_type>(size_needed), '\0');
    int const written =
        ::WideCharToMultiByte(CP_UTF8, 0, in.data(), in_len, out.data(),
                              size_needed, nullptr, nullptr);

    // Only keep the bytes that were really produced, so a failed or short
    // conversion never leaks NUL padding into the returned name.
    out.resize(written > 0 ? static_cast<std::string::size_type>(written) : 0);
    return out;
  }
#endif

  return u8string_to_string(p.u8string());
}

bool getenv_is_enabled(char const* var) {
if (auto val = std::getenv(var)) {
if (auto maybeBool = try_to<bool>(val); maybeBool && *maybeBool) {
Expand Down
27 changes: 16 additions & 11 deletions src/writer/internal/entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,20 +58,19 @@ bool is_root_path(std::string_view path) {
#endif
}

// Compute the name to store for an entry located at `path`.
//
// An entry without a parent keeps the complete path as its name; an entry
// with a parent is named after the last path component only. In both cases
// the path's u8string() is passed through u8string_to_string().
std::string entry_name(fs::path const& path, bool has_parent) {
  if (!has_parent) {
    return u8string_to_string(path.u8string());
  }

  return u8string_to_string(path.filename().u8string());
}

} // namespace

entry::entry(fs::path const& path, std::shared_ptr<entry> parent,
file_stat const& st)
: name_{entry_name(path, static_cast<bool>(parent))}
#ifdef _WIN32
: path_{parent ? path.filename() : path}
, name_{path_to_utf8_string_sanitized(path_)}
#else
: name_{path_to_utf8_string_sanitized(parent ? path.filename() : path)}
#endif
, parent_{std::move(parent)}
, stat_{st} {}
, stat_{st} {
}

bool entry::has_parent() const {
if (parent_.lock()) {
Expand All @@ -88,11 +87,17 @@ void entry::set_name(const std::string& name) { name_ = name; }
std::u8string entry::u8name() const { return string_to_u8string(name_); }

fs::path entry::fs_path() const {
#ifdef _WIN32
fs::path self = path_;
#else
fs::path self = name_;
#endif

if (auto parent = parent_.lock()) {
return parent->fs_path() / u8name();
return parent->fs_path() / self;
}

return fs::path(u8name());
return self;
}

std::string entry::path_as_string() const {
Expand Down
23 changes: 22 additions & 1 deletion src/writer/scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <stdexcept>
#include <string>
#include <system_error>
#include <unordered_set>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -330,6 +331,7 @@ class scanner_ final : public scanner::impl {
os_access const& os_;
std::vector<std::unique_ptr<entry_filter>> filters_;
std::vector<std::unique_ptr<entry_transformer>> transformers_;
std::unordered_set<std::string> invalid_filenames_;
};

template <typename LoggerPolicy>
Expand Down Expand Up @@ -362,6 +364,24 @@ scanner_<LoggerPolicy>::add_entry(std::filesystem::path const& name,
file_scanner& fs, bool debug_filter) {
try {
auto pe = entry_factory_.create(os_, name, parent);

if constexpr (!std::is_same_v<std::filesystem::path::value_type, char>) {
try {
auto tmp [[maybe_unused]] = name.filename().u8string();
} catch (std::system_error const& e) {
LOG_ERROR << fmt::format(
"invalid file name in \"{}\", storing as \"{}\": {}",
path_to_utf8_string_sanitized(name.parent_path()), pe->name(),
e.what());
if (!invalid_filenames_.emplace(path_to_utf8_string_sanitized(name))
.second) {
LOG_ERROR << fmt::format(
"cannot store \"{}\" as the name already exists", pe->name());
return nullptr;
}
}
}

bool const exclude =
std::any_of(filters_.begin(), filters_.end(), [&pe](auto const& f) {
return f->filter(*pe) == filter_action::remove;
Expand Down Expand Up @@ -452,7 +472,8 @@ scanner_<LoggerPolicy>::add_entry(std::filesystem::path const& name,

return pe;
} catch (const std::system_error& e) {
LOG_ERROR << fmt::format("error reading entry (path={}): {}", name.string(),
LOG_ERROR << fmt::format("error reading entry (path={}): {}",
path_to_utf8_string_sanitized(name),
exception_str(e));
prog.errors++;
}
Expand Down

0 comments on commit 0eb2ecb

Please sign in to comment.