Skip to content

Commit

Permalink
[backport] fix(webconnectivitylte): handle i18n domains
Browse files Browse the repository at this point in the history
This diff backports f62b76f.

This diff introduces a standalone inputparser package that
implements the same functionality as the namesake code that
was previously part of webconnectivitylte. Additionally,
the new inputparser automatically converts i18n domains to
ASCII. By doing that, we:

1. preserve the original URL in its i18n format;

2. perform the measurement with the ASCII punycode URL;

3. avoid reporting an anomaly, which we documented in
ooni/probe#1925 (comment).
bassosimone committed Mar 16, 2023
1 parent a414378 commit 691e539
Showing 4 changed files with 322 additions and 66 deletions.
63 changes: 0 additions & 63 deletions internal/experiment/webconnectivitylte/inputparser.go

This file was deleted.

7 changes: 4 additions & 3 deletions internal/experiment/webconnectivitylte/measurer.go
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@ import (
"sync/atomic"

"github.com/ooni/probe-cli/v3/internal/experiment/webconnectivity"
"github.com/ooni/probe-cli/v3/internal/inputparser"
"github.com/ooni/probe-cli/v3/internal/model"
"golang.org/x/net/publicsuffix"
)
@@ -36,7 +37,7 @@ func (m *Measurer) ExperimentName() string {

// ExperimentVersion implements model.ExperimentMeasurer.
func (m *Measurer) ExperimentVersion() string {
return "0.5.21"
return "0.5.22"
}

// Run implements model.ExperimentMeasurer.
@@ -60,15 +61,15 @@ func (m *Measurer) Run(ctx context.Context, args *model.ExperimentArgs) error {
}

// convert the input string to a URL
inputParser := &InputParser{
inputParserConfig := &inputparser.Config{
AcceptedSchemes: []string{
"http",
"https",
},
AllowEndpoints: false,
DefaultScheme: "",
}
URL, err := inputParser.Parse(string(measurement.Input))
URL, err := inputparser.Parse(inputParserConfig, measurement.Input)
if err != nil {
return err
}
157 changes: 157 additions & 0 deletions internal/inputparser/inputparser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// Package inputparser contains code to parse experiments input.
package inputparser

import (
"errors"
"fmt"
"net"
"net/url"
"reflect"

"github.com/ooni/probe-cli/v3/internal/model"
"github.com/ooni/probe-cli/v3/internal/runtimex"
"golang.org/x/net/idna"
)

// Config contains config for parsing experiments input. You MUST set
// the fields marked as MANDATORY otherwise Parse will fail.
type Config struct {
// AcceptedSchemes is the list of URL schemes that Parse accepts; Parse
// returns ErrUnsupportedScheme when the URL scheme is not in this list.
// This field is MANDATORY except when ONLY parsing endpoints: when
// AllowEndpoints is true, a URL whose scheme equals DefaultScheme is
// accepted even if that scheme is not listed here.
AcceptedSchemes []string

// AllowEndpoints OPTIONALLY tells the input parser to also accept
// endpoints (e.g., "example.com:80") as experiment inputs. When this
// field is false, an input that parses to a URL with an empty
// hostname causes Parse to return ErrEmptyHostname.
AllowEndpoints bool

// DefaultScheme is the scheme assigned to the URL built from an
// endpoint. This field is MANDATORY iff AllowEndpoints is true: when
// Parse needs to convert an endpoint and this field is empty, it
// returns ErrEmptyDefaultScheme.
DefaultScheme string
}

// Sentinel errors returned (possibly wrapped) by Parse. Callers should
// match them using errors.Is.
var (
	// ErrEmptyDefaultScheme indicates that the default scheme is empty.
	ErrEmptyDefaultScheme = errors.New("inputparser: empty default scheme")

	// ErrEmptyHostname indicates that the URL.Hostname() is empty.
	ErrEmptyHostname = errors.New("inputparser: empty URL.Hostname()")

	// ErrIDNAToASCII indicates that we cannot convert IDNA to ASCII.
	ErrIDNAToASCII = errors.New("inputparser: cannot convert IDNA to ASCII")

	// ErrInvalidEndpoint indicates that we are not parsing a valid endpoint.
	ErrInvalidEndpoint = errors.New("inputparser: invalid endpoint")

	// ErrURLParse indicates that we could not parse the URL.
	ErrURLParse = errors.New("inputparser: cannot parse URL")

	// ErrUnsupportedScheme indicates that we do not support the given URL.Scheme.
	ErrUnsupportedScheme = errors.New("inputparser: unsupported URL.Scheme")
)

// Parse turns the experiment input into a *url.URL according to config.
//
// The function proceeds as follows:
//
//  1. it asserts (via runtimex.Assert) that config is not nil and that
//     input is not empty;
//
//  2. it parses input with url.Parse, returning an error wrapping
//     ErrURLParse on failure;
//
//  3. if the parsed URL has an empty hostname, it returns ErrEmptyHostname
//     unless config.AllowEndpoints is true, in which case it attempts to
//     reinterpret the input as an endpoint using config.DefaultScheme;
//
//  4. it returns ErrUnsupportedScheme if the scheme is not acceptable;
//
//  5. it rewrites the hostname in ASCII (punycode) when needed, returning
//     an error wrapping ErrIDNAToASCII if the conversion fails.
//
// On failure the returned URL is always nil.
func Parse(config *Config, input model.MeasurementTarget) (*url.URL, error) {
	runtimex.Assert(config != nil, "passed nil config")
	runtimex.Assert(input != "", "passed empty input")

	// Step one: the input must at least be syntactically a URL.
	parsed, parseErr := url.Parse(string(input))
	if parseErr != nil {
		return nil, fmt.Errorf("%w: %s", ErrURLParse, parseErr.Error())
	}

	// Step two: without a hostname the only remaining option is that
	// the input is an endpoint, provided the config allows endpoints.
	if parsed.Hostname() == "" {
		if !config.AllowEndpoints {
			return nil, ErrEmptyHostname
		}
		endpointURL, endpointErr := maybeEndpointToURL(config, parsed)
		if endpointErr != nil {
			return nil, endpointErr
		}
		parsed = endpointURL
	}

	// Step three: enforce the schemes accepted by this experiment.
	if !isSchemeOK(config, parsed) {
		return nil, ErrUnsupportedScheme
	}

	// Step four: make sure the URL.Host is expressed in punycode.
	return maybeConvertHostnameToASCII(parsed)
}

// maybeEndpointToURL reinterprets an already parsed URL as an endpoint
// and returns either a new URL using config.DefaultScheme or an error.
//
// The input is considered an endpoint only when Scheme and Opaque are
// the only fields set in URL; otherwise the function returns
// ErrInvalidEndpoint. When config.DefaultScheme is empty, the function
// returns ErrEmptyDefaultScheme. For example, given this input:
//
//	&url.URL{Scheme:"example.com",Opaque:"80"}
//
// and `http` as the config.DefaultScheme, this function would return:
//
//	&url.URL{Scheme:"http",Host:"example.com:80"}
//
// See https://go.dev/play/p/Rk5pS_zGY5U for additional information on how
// URL.Parse will parse "example.com:80" and other endpoints.
func maybeEndpointToURL(config *Config, URL *url.URL) (*url.URL, error) {
	// When parsing an endpoint, the "scheme" is actually the host and
	// the "opaque" part is actually the port.
	host, port := URL.Scheme, URL.Opaque

	// Any other populated field means we are not looking at an endpoint.
	if !reflect.DeepEqual(URL, &url.URL{Scheme: host, Opaque: port}) {
		return nil, ErrInvalidEndpoint
	}

	// We cannot build a URL without knowing which scheme to use.
	defaultScheme := config.DefaultScheme
	if defaultScheme == "" {
		return nil, ErrEmptyDefaultScheme
	}

	// Build a fresh URL holding the endpoint as its Host.
	return &url.URL{
		Scheme: defaultScheme,
		Host:   net.JoinHostPort(host, port),
	}, nil
}

// maybeConvertHostnameToASCII makes sure the hostname inside URL.Host
// is ASCII, using idna.ToASCII for the conversion. This function MUTATES
// the input URL in place when the hostname changes, preserving the port
// (if any), and returns the same URL pointer on success. On failure, it
// returns a nil URL and an error wrapping ErrIDNAToASCII.
func maybeConvertHostnameToASCII(URL *url.URL) (*url.URL, error) {
	original := URL.Hostname()

	// Compute the ASCII representation of the hostname.
	punycode, err := idna.ToASCII(original)
	if err != nil {
		return nil, fmt.Errorf("%w: %s", ErrIDNAToASCII, err.Error())
	}

	// Nothing to rewrite when the hostname was already ASCII.
	if punycode == original {
		return URL, nil
	}

	// Rebuild URL.Host taking care of keeping the original port.
	rewrittenHost := punycode
	if port := URL.Port(); port != "" {
		rewrittenHost = net.JoinHostPort(punycode, port)
	}
	URL.Host = rewrittenHost
	return URL, nil
}

// isSchemeOK reports whether URL.Scheme is acceptable for config. A
// scheme is acceptable when it is listed in config.AcceptedSchemes or,
// if config.AllowEndpoints is true, when it equals config.DefaultScheme.
func isSchemeOK(config *Config, URL *url.URL) bool {
	// We don't need to provide AcceptedSchemes when ONLY parsing endpoints
	// because endpoints are rewritten to use the DefaultScheme.
	if config.AllowEndpoints && URL.Scheme == config.DefaultScheme {
		return true
	}

	// Otherwise the scheme must be explicitly listed as accepted.
	for idx := range config.AcceptedSchemes {
		if config.AcceptedSchemes[idx] == URL.Scheme {
			return true
		}
	}
	return false
}
161 changes: 161 additions & 0 deletions internal/inputparser/inputparser_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package inputparser

import (
"errors"
"net/url"
"testing"

"github.com/google/go-cmp/cmp"
"github.com/ooni/probe-cli/v3/internal/model"
)

// TestParse is a table-driven test exercising Parse with endpoints, plain
// URLs, unsupported schemes, and internationalized domain names.
func TestParse(t *testing.T) {

	// scenario describes a single invocation of Parse along with
	// the results we expect from such an invocation.
	type scenario struct {
		// name is the MANDATORY name of the scenario.
		name string

		// config is the MANDATORY Config to use.
		config *Config

		// input is the MANDATORY string-format input-URL to parse.
		input model.MeasurementTarget

		// expectURL is the OPTIONAL URL we expect in output.
		expectURL *url.URL

		// expectErr is the OPTIONAL error we expect in output.
		expectErr error
	}

	// httpOnlyConfig returns a fresh Config that only accepts the
	// "http" scheme and does not accept endpoints.
	httpOnlyConfig := func() *Config {
		return &Config{
			AcceptedSchemes: []string{"http"},
			AllowEndpoints:  false,
			DefaultScheme:   "",
		}
	}

	scenarios := []scenario{
		{
			name: "when the input is an endpoint and we accept endpoints",
			config: &Config{
				// We don't need to provide an AcceptedScheme when ONLY parsing endpoints.
				AcceptedSchemes: []string{""},
				AllowEndpoints:  true,
				DefaultScheme:   "http",
			},
			input: "example.com:80",
			expectURL: &url.URL{
				Scheme: "http",
				Host:   "example.com:80",
			},
			expectErr: nil,
		},
		{
			name:      "when the input is an endpoint and we don't accept endpoints",
			config:    httpOnlyConfig(),
			input:     "example.com:80",
			expectURL: nil,
			expectErr: ErrEmptyHostname,
		},
		{
			name: "when the input is a domain or IP address and we accept endpoints",
			config: &Config{
				AcceptedSchemes: []string{"http"},
				AllowEndpoints:  true,
				DefaultScheme:   "http",
			},
			input:     "example.com",
			expectURL: nil,
			expectErr: ErrInvalidEndpoint,
		},
		{
			name:      "when the URL does not parse",
			config:    httpOnlyConfig(),
			input:     "http://\t/\r\n",
			expectURL: nil,
			expectErr: ErrURLParse,
		},
		{
			name:      "when the URL scheme is unsupported",
			config:    httpOnlyConfig(),
			input:     "smtp://example.com:53",
			expectURL: nil,
			expectErr: ErrUnsupportedScheme,
		},
		{
			name: "when the default scheme is empty",
			config: &Config{
				AcceptedSchemes: []string{},
				AllowEndpoints:  true,
				DefaultScheme:   "",
			},
			input:     "example.com:80",
			expectURL: nil,
			expectErr: ErrEmptyDefaultScheme,
		},
		{
			name:   "for IDNA URL without a port",
			config: httpOnlyConfig(),
			input:  "http://ουτοπία.δπθ.gr/",
			expectURL: &url.URL{
				Scheme: "http",
				Host:   "xn--kxae4bafwg.xn--pxaix.gr",
				Path:   "/",
			},
			expectErr: nil,
		},
		{
			name:   "for IDNA URL with a port",
			config: httpOnlyConfig(),
			input:  "http://ουτοπία.δπθ.gr:80/",
			expectURL: &url.URL{
				Scheme: "http",
				Host:   "xn--kxae4bafwg.xn--pxaix.gr:80",
				Path:   "/",
			},
			expectErr: nil,
		},
		{
			name:   "when we cannot convert IDNA to ASCII",
			config: httpOnlyConfig(),
			// See https://www.farsightsecurity.com/blog/txt-record/punycode-20180711/
			input:     "http://xn--0000h/",
			expectURL: nil,
			expectErr: ErrIDNAToASCII,
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			got, err := Parse(sc.config, sc.input)

			// Exactly one of the two errors being nil is always a failure.
			if (err == nil) != (sc.expectErr == nil) {
				t.Fatal("expected", sc.expectErr, "got", err)
			}

			// When both are set, the returned error must match the expectation.
			if err != nil && !errors.Is(err, sc.expectErr) {
				t.Fatal("unexpected error", err)
			}

			// Finally, validate the returned URL.
			if diff := cmp.Diff(sc.expectURL, got); diff != "" {
				t.Fatal(diff)
			}
		})
	}
}

0 comments on commit 691e539

Please sign in to comment.