From 2b9928561dd5d83e8d7332a09c3a77ead5f43a95 Mon Sep 17 00:00:00 2001 From: Antoine du Hamel Date: Thu, 31 Oct 2024 22:11:27 +0100 Subject: [PATCH] url: refactor `pathToFileURL` to native PR-URL: https://github.com/nodejs/node/pull/55476 Reviewed-By: Yagiz Nizipli --- lib/internal/url.js | 92 +++++++-------------------------------- src/node_url.cc | 104 ++++++++++++++++++++++++++++++++++++++++++++ src/node_url.h | 1 + 3 files changed, 120 insertions(+), 77 deletions(-) diff --git a/lib/internal/url.js b/lib/internal/url.js index 64f1b0b6cc0ac6..14b0ef61d2f91c 100644 --- a/lib/internal/url.js +++ b/lib/internal/url.js @@ -16,7 +16,6 @@ const { ObjectSetPrototypeOf, ReflectGetOwnPropertyDescriptor, ReflectOwnKeys, - RegExpPrototypeSymbolReplace, SafeMap, SafeSet, StringPrototypeCharAt, @@ -779,6 +778,8 @@ function isURL(self) { * for invalid URL inputs. */ const kParseURLSymbol = Symbol('kParseURL'); +const kCreateURLFromPosixPathSymbol = Symbol('kCreateURLFromPosixPath'); +const kCreateURLFromWindowsPathSymbol = Symbol('kCreateURLFromWindowsPath'); class URL { #context = new URLContext(); @@ -812,8 +813,17 @@ class URL { base = `${base}`; } - const raiseException = parseSymbol !== kParseURLSymbol; - const href = bindingUrl.parse(input, base, raiseException); + let href; + if (arguments.length < 3) { + href = bindingUrl.parse(input, base, true); + } else { + const raiseException = parseSymbol !== kParseURLSymbol; + const interpretAsWindowsPath = parseSymbol === kCreateURLFromWindowsPathSymbol; + const pathToFileURL = interpretAsWindowsPath || (parseSymbol === kCreateURLFromPosixPathSymbol); + href = pathToFileURL ? + bindingUrl.pathToFileURL(input, interpretAsWindowsPath, base) : + bindingUrl.parse(input, base, raiseException); + } if (href) { this.#updateContext(href); } @@ -1500,76 +1510,9 @@ function fileURLToPath(path, options = kEmptyObject) { return (windows ?? isWindows) ? getPathFromURLWin32(path) : getPathFromURLPosix(path); } -// RFC1738 defines the following chars as "unsafe" for URLs -// @see https://www.ietf.org/rfc/rfc1738.txt 2.2. URL Character Encoding Issues -const percentRegEx = /%/g; -const newlineRegEx = /\n/g; -const carriageReturnRegEx = /\r/g; -const tabRegEx = /\t/g; -const quoteRegEx = /"/g; -const hashRegex = /#/g; -const spaceRegEx = / /g; -const questionMarkRegex = /\?/g; -const openSquareBracketRegEx = /\[/g; -const backslashRegEx = /\\/g; -const closeSquareBracketRegEx = /]/g; -const caretRegEx = /\^/g; -const verticalBarRegEx = /\|/g; -const tildeRegEx = /~/g; - -function encodePathChars(filepath, options = kEmptyObject) { - if (StringPrototypeIncludes(filepath, '%')) { - filepath = RegExpPrototypeSymbolReplace(percentRegEx, filepath, '%25'); - } - - if (StringPrototypeIncludes(filepath, '\t')) { - filepath = RegExpPrototypeSymbolReplace(tabRegEx, filepath, '%09'); - } - if (StringPrototypeIncludes(filepath, '\n')) { - filepath = RegExpPrototypeSymbolReplace(newlineRegEx, filepath, '%0A'); - } - if (StringPrototypeIncludes(filepath, '\r')) { - filepath = RegExpPrototypeSymbolReplace(carriageReturnRegEx, filepath, '%0D'); - } - if (StringPrototypeIncludes(filepath, ' ')) { - filepath = RegExpPrototypeSymbolReplace(spaceRegEx, filepath, '%20'); - } - if (StringPrototypeIncludes(filepath, '"')) { - filepath = RegExpPrototypeSymbolReplace(quoteRegEx, filepath, '%22'); - } - if (StringPrototypeIncludes(filepath, '#')) { - filepath = RegExpPrototypeSymbolReplace(hashRegex, filepath, '%23'); - } - if (StringPrototypeIncludes(filepath, '?')) { - filepath = RegExpPrototypeSymbolReplace(questionMarkRegex, filepath, '%3F'); - } - if (StringPrototypeIncludes(filepath, '[')) { - filepath = RegExpPrototypeSymbolReplace(openSquareBracketRegEx, filepath, '%5B'); - } - // Back-slashes must be special-cased on Windows, where they are treated as path separator. - if (!options.windows && StringPrototypeIncludes(filepath, '\\')) { - filepath = RegExpPrototypeSymbolReplace(backslashRegEx, filepath, '%5C'); - } - if (StringPrototypeIncludes(filepath, ']')) { - filepath = RegExpPrototypeSymbolReplace(closeSquareBracketRegEx, filepath, '%5D'); - } - if (StringPrototypeIncludes(filepath, '^')) { - filepath = RegExpPrototypeSymbolReplace(caretRegEx, filepath, '%5E'); - } - if (StringPrototypeIncludes(filepath, '|')) { - filepath = RegExpPrototypeSymbolReplace(verticalBarRegEx, filepath, '%7C'); - } - if (StringPrototypeIncludes(filepath, '~')) { - filepath = RegExpPrototypeSymbolReplace(tildeRegEx, filepath, '%7E'); - } - - return filepath; -} - function pathToFileURL(filepath, options = kEmptyObject) { const windows = options?.windows ?? isWindows; if (windows && StringPrototypeStartsWith(filepath, '\\\\')) { - const outURL = new URL('file://'); // UNC path format: \\server\share\resource // Handle extended UNC path and standard UNC path // "\\?\UNC\" path prefix should be ignored. @@ -1592,12 +1535,7 @@ function pathToFileURL(filepath, options = kEmptyObject) { ); } const hostname = StringPrototypeSlice(filepath, prefixLength, hostnameEndIndex); - outURL.hostname = domainToASCII(hostname); - outURL.pathname = encodePathChars( - RegExpPrototypeSymbolReplace(backslashRegEx, StringPrototypeSlice(filepath, hostnameEndIndex), '/'), - { windows }, - ); - return outURL; + return new URL(StringPrototypeSlice(filepath, hostnameEndIndex), hostname, kCreateURLFromWindowsPathSymbol); } let resolved = windows ? path.win32.resolve(filepath) : path.posix.resolve(filepath); // path.resolve strips trailing slashes so we must add them back @@ -1608,7 +1546,7 @@ function pathToFileURL(filepath, options = kEmptyObject) { resolved[resolved.length - 1] !== path.sep) resolved += '/'; - return new URL(`file://${encodePathChars(resolved, { windows })}`); + return new URL(resolved, undefined, windows ? kCreateURLFromWindowsPathSymbol : kCreateURLFromPosixPathSymbol); } function toPathIfFileURL(fileURLOrPath) { diff --git a/src/node_url.cc b/src/node_url.cc index d49229f2b1f536..5b854cd9aeaa8b 100644 --- a/src/node_url.cc +++ b/src/node_url.cc @@ -75,6 +75,108 @@ void BindingData::Deserialize(v8::Local context, CHECK_NOT_NULL(binding); } +#ifndef LARGEST_ASCII_CHAR_CODE_TO_ENCODE +#define LARGEST_ASCII_CHAR_CODE_TO_ENCODE '~' +#endif + +// RFC1738 defines the following chars as "unsafe" for URLs +// @see https://www.ietf.org/rfc/rfc1738.txt 2.2. URL Character Encoding Issues +constexpr auto lookup_table = []() consteval { + // Each entry is an array that can hold up to 3 chars + null terminator + std::array, LARGEST_ASCII_CHAR_CODE_TO_ENCODE + 1> + result{}; + + for (uint8_t i = 0; i <= LARGEST_ASCII_CHAR_CODE_TO_ENCODE; i++) { + switch (i) { +#define ENCODE_CHAR(CHAR, HEX_DIGIT_2, HEX_DIGIT_1) \ + case CHAR: \ + result[i] = {{'%', HEX_DIGIT_2, HEX_DIGIT_1, 0}}; \ + break; + + ENCODE_CHAR('\0', '0', '0') // '\0' == 0x00 + ENCODE_CHAR('\t', '0', '9') // '\t' == 0x09 + ENCODE_CHAR('\n', '0', 'A') // '\n' == 0x0A + ENCODE_CHAR('\r', '0', 'D') // '\r' == 0x0D + ENCODE_CHAR(' ', '2', '0') // ' ' == 0x20 + ENCODE_CHAR('"', '2', '2') // '"' == 0x22 + ENCODE_CHAR('#', '2', '3') // '#' == 0x23 + ENCODE_CHAR('%', '2', '5') // '%' == 0x25 + ENCODE_CHAR('?', '3', 'F') // '?' == 0x3F + ENCODE_CHAR('[', '5', 'B') // '[' == 0x5B + ENCODE_CHAR('\\', '5', 'C') // '\\' == 0x5C + ENCODE_CHAR(']', '5', 'D') // ']' == 0x5D + ENCODE_CHAR('^', '5', 'E') // '^' == 0x5E + ENCODE_CHAR('|', '7', 'C') // '|' == 0x7C + ENCODE_CHAR('~', '7', 'E') // '~' == 0x7E +#undef ENCODE_CHAR + + default: + result[i] = {{static_cast(i), '\0', '\0', '\0'}}; + break; + } + } + + return result; +} +(); + +enum class OS { WINDOWS, POSIX }; + +std::string EncodePathChars(std::string_view input_str, OS operating_system) { + std::string encoded = "file://"; + encoded.reserve(input_str.size() + + 7); // Reserve space for "file://" and input_str + for (size_t i : input_str) { + if (i > LARGEST_ASCII_CHAR_CODE_TO_ENCODE) [[unlikely]] { + encoded.push_back(i); + continue; + } + if (operating_system == OS::WINDOWS) { + if (i == '\\') { + encoded.push_back('/'); + continue; + } + } + encoded.append(lookup_table[i].data()); + } + + return encoded; +} + +void BindingData::PathToFileURL(const FunctionCallbackInfo& args) { + CHECK_GE(args.Length(), 2); // input + CHECK(args[0]->IsString()); + CHECK(args[1]->IsBoolean()); + + Realm* realm = Realm::GetCurrent(args); + BindingData* binding_data = realm->GetBindingData(); + Isolate* isolate = realm->isolate(); + OS os = args[1]->IsTrue() ? OS::WINDOWS : OS::POSIX; + + Utf8Value input(isolate, args[0]); + auto input_str = input.ToStringView(); + CHECK(!input_str.empty()); + + auto out = + ada::parse(EncodePathChars(input_str, os), nullptr); + + if (!out) { + return ThrowInvalidURL(realm->env(), input.ToStringView(), nullptr); + } + + if (os == OS::WINDOWS && args.Length() > 2 && !args[2]->IsUndefined()) + [[unlikely]] { + CHECK(args[2]->IsString()); + Utf8Value hostname(isolate, args[2]); + CHECK(out->set_hostname(hostname.ToStringView())); + } + + binding_data->UpdateComponents(out->get_components(), out->type); + + args.GetReturnValue().Set( + ToV8Value(realm->context(), out->get_href(), isolate).ToLocalChecked()); +} + void BindingData::DomainToASCII(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); CHECK_GE(args.Length(), 1); // input @@ -371,6 +473,7 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data, SetMethodNoSideEffect(isolate, target, "format", Format); SetMethodNoSideEffect(isolate, target, "getOrigin", GetOrigin); SetMethod(isolate, target, "parse", Parse); + SetMethod(isolate, target, "pathToFileURL", PathToFileURL); SetMethod(isolate, target, "update", Update); SetFastMethodNoSideEffect( isolate, target, "canParse", CanParse, {fast_can_parse_methods_, 2}); @@ -391,6 +494,7 @@ void BindingData::RegisterExternalReferences( registry->Register(Format); registry->Register(GetOrigin); registry->Register(Parse); + registry->Register(PathToFileURL); registry->Register(Update); registry->Register(CanParse); registry->Register(FastCanParse); diff --git a/src/node_url.h b/src/node_url.h index 39fe9c9c8506e9..74f8a49955ce46 100644 --- a/src/node_url.h +++ b/src/node_url.h @@ -59,6 +59,7 @@ class BindingData : public SnapshotableObject { static void Format(const v8::FunctionCallbackInfo& args); static void GetOrigin(const v8::FunctionCallbackInfo& args); static void Parse(const v8::FunctionCallbackInfo& args); + static void PathToFileURL(const v8::FunctionCallbackInfo& args); static void Update(const v8::FunctionCallbackInfo& args); static void CreatePerIsolateProperties(IsolateData* isolate_data,