-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[backport] fix(webconnectivitylte): handle i18n domains
This diff backports f62b76f. This diff introduces a standalone inputparser package that implements the same functionality as the namesake code that previously was part of webconnectivitylte. Additionally, the new inputparser automatically converts i18n domains to ASCII. By doing that, we: 1. preserve the original URL in its i18n format; 2. perform the measurement with the ASCII punycode URL; 3. avoid reporting an anomaly, which we documented in ooni/probe#1925 (comment).
1 parent
a414378
commit 691e539
Showing
4 changed files
with
322 additions
and
66 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
// Package inputparser contains code to parse experiments input. | ||
package inputparser | ||
|
||
import ( | ||
"errors" | ||
"fmt" | ||
"net" | ||
"net/url" | ||
"reflect" | ||
|
||
"github.com/ooni/probe-cli/v3/internal/model" | ||
"github.com/ooni/probe-cli/v3/internal/runtimex" | ||
"golang.org/x/net/idna" | ||
) | ||
|
||
// Config holds the settings that drive Parse. You MUST set the fields
// marked as MANDATORY, otherwise Parse will fail.
type Config struct {
	// AcceptedSchemes lists the URL schemes that Parse accepts. This field
	// is MANDATORY except when ONLY parsing endpoints: in that case the
	// resulting URL always uses DefaultScheme, so there is no need to
	// validate the scheme against this list.
	AcceptedSchemes []string

	// AllowEndpoints OPTIONALLY tells Parse to also accept endpoints
	// (e.g., "example.com:80") as experiment inputs.
	AllowEndpoints bool

	// DefaultScheme is the scheme assigned to the URL built from an
	// endpoint. It is MANDATORY iff AllowEndpoints is true.
	DefaultScheme string
}
|
||
// Sentinel errors returned (possibly wrapped) by Parse. Use errors.Is
// to check for them.
var (
	// ErrEmptyDefaultScheme indicates that the default scheme is empty.
	ErrEmptyDefaultScheme = errors.New("inputparser: empty default scheme")

	// ErrEmptyHostname indicates that the URL.Hostname() is empty.
	ErrEmptyHostname = errors.New("inputparser: empty URL.Hostname()")

	// ErrIDNAToASCII indicates that we cannot convert IDNA to ASCII.
	ErrIDNAToASCII = errors.New("inputparser: cannot convert IDNA to ASCII")

	// ErrInvalidEndpoint indicates that we are not parsing a valid endpoint.
	ErrInvalidEndpoint = errors.New("inputparser: invalid endpoint")

	// ErrURLParse indicates that we could not parse the URL.
	ErrURLParse = errors.New("inputparser: cannot parse URL")

	// ErrUnsupportedScheme indicates that we do not support the given URL.Scheme.
	ErrUnsupportedScheme = errors.New("inputparser: unsupported URL.Scheme")
)
|
||
// Parse parses the experiment input using the given config and returns | ||
// to the caller either the resulting URL or an error. | ||
func Parse(config *Config, input model.MeasurementTarget) (*url.URL, error) { | ||
runtimex.Assert(config != nil, "passed nil config") | ||
runtimex.Assert(input != "", "passed empty input") | ||
|
||
// Attempt to parse the input as an URL. | ||
URL, err := url.Parse(string(input)) | ||
if err != nil { | ||
return nil, fmt.Errorf("%w: %s", ErrURLParse, err.Error()) | ||
} | ||
|
||
// Reject empty URL.Hostname(). | ||
if URL.Hostname() == "" { | ||
// If we are not allowed to parse endpoints, just emit an error. | ||
if !config.AllowEndpoints { | ||
return nil, ErrEmptyHostname | ||
} | ||
|
||
// Check whether we could interpret the URL as an endpoint. | ||
URL, err = maybeEndpointToURL(config, URL) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// Fallthrough on success. | ||
} | ||
|
||
// Reject schemes that are not allowed for this experiment. | ||
if !isSchemeOK(config, URL) { | ||
return nil, ErrUnsupportedScheme | ||
} | ||
|
||
// Possibly rewrite the URL.Host to be in punycode. | ||
return maybeConvertHostnameToASCII(URL) | ||
} | ||
|
||
// maybeEndpointToURL takes in input an already parsed URL and returns | ||
// in output either a new URL containing an endpoint with the configured | ||
// default scheme or an error. For example, given this input: | ||
// | ||
// &url.URL{Scheme:"example.com",Opaque:"80"} | ||
// | ||
// and `http` as the config.DefaultScheme, this function would return: | ||
// | ||
// &url.URL{Scheme:"http",Host:"example.com:80"} | ||
// | ||
// See https://go.dev/play/p/Rk5pS_zGY5U for additional information on how | ||
// URL.Parse will parse "example.com:80" and other endpoints. | ||
func maybeEndpointToURL(config *Config, URL *url.URL) (*url.URL, error) { | ||
// Make sure the parsing result is exactly what we expected. | ||
expect := &url.URL{ | ||
Scheme: URL.Scheme, | ||
Opaque: URL.Opaque, | ||
} | ||
if !reflect.DeepEqual(URL, expect) { | ||
return nil, ErrInvalidEndpoint | ||
} | ||
|
||
// Make sure we actually have a valid default scheme. | ||
if config.DefaultScheme == "" { | ||
return nil, ErrEmptyDefaultScheme | ||
} | ||
|
||
// Rewrite the URL to contain the endpoint. | ||
URL = &url.URL{ | ||
Scheme: config.DefaultScheme, | ||
Host: net.JoinHostPort(expect.Scheme, expect.Opaque), | ||
} | ||
return URL, nil | ||
} | ||
|
||
// maybeConvertHostnameToASCII takes in input a URL and converts | ||
// the URL.Host to become ASCII. This function MUTATES the input URL | ||
// in place and returns either the mutated URL or an error. | ||
func maybeConvertHostnameToASCII(URL *url.URL) (*url.URL, error) { | ||
hostname := URL.Hostname() | ||
|
||
// Obtain an ASCII representation of the URL.Hostname(). | ||
asciiHostname, err := idna.ToASCII(hostname) | ||
if err != nil { | ||
return nil, fmt.Errorf("%w: %s", ErrIDNAToASCII, err.Error()) | ||
} | ||
|
||
// Possibly rewrite the URL.Host to be in punycode. | ||
if asciiHostname != hostname { | ||
if port := URL.Port(); port != "" { | ||
URL.Host = net.JoinHostPort(asciiHostname, port) | ||
} else { | ||
URL.Host = asciiHostname | ||
} | ||
} | ||
|
||
// Return the parsed URL to the caller. | ||
return URL, nil | ||
} | ||
|
||
// isSchemeOK indicates whether the given URL scheme is OK. | ||
func isSchemeOK(config *Config, URL *url.URL) bool { | ||
for _, scheme := range config.AcceptedSchemes { | ||
if URL.Scheme == scheme { | ||
return true | ||
} | ||
} | ||
// We don't need to provide AcceptedSchemes when ONLY parsing endpoints. | ||
return config.AllowEndpoints && URL.Scheme == config.DefaultScheme | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
package inputparser | ||
|
||
import ( | ||
"errors" | ||
"net/url" | ||
"testing" | ||
|
||
"github.com/google/go-cmp/cmp" | ||
"github.com/ooni/probe-cli/v3/internal/model" | ||
) | ||
|
||
func TestParse(t *testing.T) { | ||
|
||
// testCase describes a test case. | ||
type testCase struct { | ||
// name is the MANDATORY name of the test case. | ||
name string | ||
|
||
// config is the MANDATORY Config to use. | ||
config *Config | ||
|
||
// input is the MANDATORY string-format input-URL to parse. | ||
input model.MeasurementTarget | ||
|
||
// expectURL is the OPTIONAL URL we expect in output. | ||
expectURL *url.URL | ||
|
||
// expectErr is the OPTIONAL error we expect in output. | ||
expectErr error | ||
} | ||
|
||
var allTestCases = []testCase{{ | ||
name: "when the input is an endpoint and we accept endpoints", | ||
config: &Config{ | ||
// We don't need to provide an AcceptedScheme when ONLY parsing endpoints. | ||
AcceptedSchemes: []string{""}, | ||
AllowEndpoints: true, | ||
DefaultScheme: "http", | ||
}, | ||
input: "example.com:80", | ||
expectURL: &url.URL{ | ||
Scheme: "http", | ||
Host: "example.com:80", | ||
}, | ||
expectErr: nil, | ||
}, { | ||
name: "when the input is an endpoint and we don't accept endpoints", | ||
config: &Config{ | ||
AcceptedSchemes: []string{"http"}, | ||
AllowEndpoints: false, | ||
DefaultScheme: "", | ||
}, | ||
input: "example.com:80", | ||
expectURL: nil, | ||
expectErr: ErrEmptyHostname, | ||
}, { | ||
name: "when the input is a domain or IP address and we accept endpoints", | ||
config: &Config{ | ||
AcceptedSchemes: []string{"http"}, | ||
AllowEndpoints: true, | ||
DefaultScheme: "http", | ||
}, | ||
input: "example.com", | ||
expectURL: nil, | ||
expectErr: ErrInvalidEndpoint, | ||
}, { | ||
name: "when the URL does not parse", | ||
config: &Config{ | ||
AcceptedSchemes: []string{"http"}, | ||
AllowEndpoints: false, | ||
DefaultScheme: "", | ||
}, | ||
input: "http://\t/\r\n", | ||
expectURL: nil, | ||
expectErr: ErrURLParse, | ||
}, { | ||
name: "when the URL scheme is unsupported", | ||
config: &Config{ | ||
AcceptedSchemes: []string{"http"}, | ||
AllowEndpoints: false, | ||
DefaultScheme: "", | ||
}, | ||
input: "smtp://example.com:53", | ||
expectURL: nil, | ||
expectErr: ErrUnsupportedScheme, | ||
}, { | ||
name: "when the default scheme is empty", | ||
config: &Config{ | ||
AcceptedSchemes: []string{}, | ||
AllowEndpoints: true, | ||
DefaultScheme: "", | ||
}, | ||
input: "example.com:80", | ||
expectURL: nil, | ||
expectErr: ErrEmptyDefaultScheme, | ||
}, { | ||
name: "for IDNA URL without a port", | ||
config: &Config{ | ||
AcceptedSchemes: []string{"http"}, | ||
AllowEndpoints: false, | ||
DefaultScheme: "", | ||
}, | ||
input: "http://ουτοπία.δπθ.gr/", | ||
expectURL: &url.URL{ | ||
Scheme: "http", | ||
Host: "xn--kxae4bafwg.xn--pxaix.gr", | ||
Path: "/", | ||
}, | ||
expectErr: nil, | ||
}, { | ||
name: "for IDNA URL with a port", | ||
config: &Config{ | ||
AcceptedSchemes: []string{"http"}, | ||
AllowEndpoints: false, | ||
DefaultScheme: "", | ||
}, | ||
input: "http://ουτοπία.δπθ.gr:80/", | ||
expectURL: &url.URL{ | ||
Scheme: "http", | ||
Host: "xn--kxae4bafwg.xn--pxaix.gr:80", | ||
Path: "/", | ||
}, | ||
expectErr: nil, | ||
}, { | ||
name: "when we cannot convert IDNA to ASCII", | ||
config: &Config{ | ||
AcceptedSchemes: []string{"http"}, | ||
AllowEndpoints: false, | ||
DefaultScheme: "", | ||
}, | ||
// See https://www.farsightsecurity.com/blog/txt-record/punycode-20180711/ | ||
input: "http://xn--0000h/", | ||
expectURL: nil, | ||
expectErr: ErrIDNAToASCII, | ||
}} | ||
|
||
for _, tc := range allTestCases { | ||
t.Run(tc.name, func(t *testing.T) { | ||
URL, err := Parse(tc.config, tc.input) | ||
|
||
// parse the error | ||
switch { | ||
case err == nil && tc.expectErr == nil: | ||
// nothing | ||
case err == nil && tc.expectErr != nil: | ||
t.Fatal("expected", tc.expectErr, "got", err) | ||
case err != nil && tc.expectErr == nil: | ||
t.Fatal("expected", tc.expectErr, "got", err) | ||
default: | ||
if !errors.Is(err, tc.expectErr) { | ||
t.Fatal("unexpected error", err) | ||
} | ||
} | ||
|
||
// validate the returned URL | ||
if diff := cmp.Diff(tc.expectURL, URL); diff != "" { | ||
t.Fatal(diff) | ||
} | ||
}) | ||
} | ||
} |