From 4d6b2ffb8fcc277d2009678cbb9eca31187aae3b Mon Sep 17 00:00:00 2001 From: Johannes Kaufmann Date: Sun, 17 Nov 2024 18:14:36 +0100 Subject: [PATCH] converter: allow base domain with url scheme --- cli/cmd/exec_test.go | 17 ++++++++--------- converter/url.go | 30 +++++++++++++++++++++++------- converter/url_test.go | 30 +++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 17 deletions(-) diff --git a/cli/cmd/exec_test.go b/cli/cmd/exec_test.go index 27eab1e..688f8fb 100644 --- a/cli/cmd/exec_test.go +++ b/cli/cmd/exec_test.go @@ -454,15 +454,14 @@ func TestExecute_General(t *testing.T) { expectedStdout: []byte("![](http://example.com/image.png)\n"), }, - // TODO: with https domain - // { - // desc: "[domain] with https domain", - // - // inputStdin: []byte(``), - // inputArgs: []string{"html2markdown", "--domain", "https://example.com"}, - // - // expectedStdout: []byte("![](https://example.com/image.png)\n"), - // }, + { + desc: "[domain] with https domain", + + inputStdin: []byte(``), + inputArgs: []string{"html2markdown", "--domain", "https://example.com"}, + + expectedStdout: []byte("![](https://example.com/image.png)\n"), + }, // - - - - - selectors - - - - - // { diff --git a/converter/url.go b/converter/url.go index ede0361..febd4b5 100644 --- a/converter/url.go +++ b/converter/url.go @@ -16,6 +16,25 @@ var percentEncodingReplacer = strings.NewReplacer( ">", "%3E", ) +func parseBaseDomain(rawDomain string) *url.URL { + if rawDomain == "" { + return nil + } + + u1, err := url.Parse(rawDomain) + if err == nil && u1.Host != "" { + // Yes, we got valid domain (probably with a http/https scheme) + return u1 + } + + u2, err := url.Parse("http://" + rawDomain) + if err == nil && u2.Host != "" { + // Yes, we got a valid domain (by choosing a fallback scheme) + return u2 + } + + return nil +} func defaultAssembleAbsoluteURL(tagName string, rawURL string, domain string) string { rawURL = strings.TrimSpace(rawURL) @@ -51,13 +70,10 @@ func defaultAssembleAbsoluteURL(tagName string, rawURL string, domain string) st // e.g. the email reading "Hi+Johannes" instead of "Hi Johannes" u.RawQuery = strings.ReplaceAll(u.RawQuery, "+", "%20") - if domain != "" { - if u.Scheme == "" { - u.Scheme = "http" - } - if u.Host == "" { - u.Host = domain - } + if base := parseBaseDomain(domain); base != nil { + // If a "domain" is provided, we use that to convert relative links + // to absolute links. + u = base.ResolveReference(u) } return percentEncodingReplacer.Replace(u.String()) diff --git a/converter/url_test.go b/converter/url_test.go index 5f4375c..f87b6ed 100644 --- a/converter/url_test.go +++ b/converter/url_test.go @@ -66,6 +66,34 @@ func TestDefaultAssembleAbsoluteURL(t *testing.T) { expected: "http://test.com/page.html?key=val#hash", }, + { + desc: "with http domain", + + tagName: "a", + input: "/page.html?key=val#hash", + domain: "http://test.com", + + expected: "http://test.com/page.html?key=val#hash", + }, + { + desc: "with https domain", + + tagName: "a", + input: "/page.html?key=val#hash", + domain: "https://test.com", + + expected: "https://test.com/page.html?key=val#hash", + }, + { + desc: "with domain that includes path", + + tagName: "a", + input: "/page.html?key=val#hash", + domain: "https://test.com/random_stuff", + + expected: "https://test.com/page.html?key=val#hash", + }, + { desc: "data uri", @@ -223,7 +251,7 @@ func TestDefaultAssembleAbsoluteURL(t *testing.T) { } } -func TestParseAndEncode(t *testing.T) { +func TestParseAndEncodeQuery(t *testing.T) { runs := []struct { desc string