From ae5abc26f6b23e87d8074745149b20cadfbe60df Mon Sep 17 00:00:00 2001
From: Alex Goodman <wagoodman@users.noreply.github.com>
Date: Wed, 23 Feb 2022 15:18:12 -0500
Subject: [PATCH] Allow for CPE strings that can later be sanitized (#844)

---
 syft/pkg/cpe.go      | 40 ++++++++++++++++++---
 syft/pkg/cpe_test.go | 82 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 103 insertions(+), 19 deletions(-)

diff --git a/syft/pkg/cpe.go b/syft/pkg/cpe.go
index db0194d9217c..9c87df709b36 100644
--- a/syft/pkg/cpe.go
+++ b/syft/pkg/cpe.go
@@ -10,6 +10,10 @@ import (
 
 type CPE = wfn.Attributes
 
+const (
+	allowedCPEPunctuation = "-!\"#$%&'()+,./:;<=>@[]^`{|}~"
+)
+
 // This regex string is taken from
 // https://csrc.nist.gov/schema/cpe/2.3/cpe-naming_2.3.xsd which has the official cpe spec
 // This first part matches CPE urls and the second part matches binding strings
@@ -20,12 +24,35 @@ const cpeRegexString = ((`^([c][pP][eE]:/[AHOaho]?(:[A-Za-z0-9\._\-~%]*){0,6})`)
 
 var cpeRegex = regexp.MustCompile(cpeRegexString)
 
+// NewCPE will parse a formatted CPE string and return a CPE object. Some input, such as the existence of whitespace
+// characters is allowed, however, a more strict validation is done after this sanitization process.
 func NewCPE(cpeStr string) (CPE, error) {
+	// get a CPE object based on the given string --don't validate yet since it may be possible to escape select cases on the callers behalf
+	c, err := newCPEWithoutValidation(cpeStr)
+	if err != nil {
+		return CPE{}, fmt.Errorf("unable to prase CPE string: %w", err)
+	}
+
+	// ensure that this CPE can be validated after being fully sanitized
+	if validateCPEString(CPEString(c)) != nil {
+		return CPE{}, err
+	}
+
+	// we don't return the sanitized string, as this is a concern for later when creating CPE strings. In fact, since
+	// sanitization is lossy (whitespace is replaced, not escaped) it's important that the raw values are left as.
+	return c, nil
+}
+
+func validateCPEString(cpeStr string) error {
 	// We should filter out all CPEs that do not match the official CPE regex
 	// The facebook nvdtools parser can sometimes incorrectly parse invalid CPE strings
 	if !cpeRegex.MatchString(cpeStr) {
-		return CPE{}, fmt.Errorf("failed to parse CPE=%q as it doesn't match the regex=%s", cpeStr, cpeRegexString)
+		return fmt.Errorf("failed to parse CPE=%q as it doesn't match the regex=%s", cpeStr, cpeRegexString)
 	}
+	return nil
+}
+
+func newCPEWithoutValidation(cpeStr string) (CPE, error) {
 	value, err := wfn.Parse(cpeStr)
 	if err != nil {
 		return CPE{}, fmt.Errorf("failed to parse CPE=%q: %w", cpeStr, err)
@@ -60,6 +87,9 @@ func MustCPE(cpeStr string) CPE {
 }
 
 func normalizeCpeField(field string) string {
+	// replace spaces with underscores (per section 5.3.2 of the CPE spec v 2.3)
+	field = strings.ReplaceAll(field, " ", "_")
+
 	// keep dashes and forward slashes unescaped
 	if field == "*" {
 		return wfn.Any
@@ -71,10 +101,9 @@ func normalizeCpeField(field string) string {
 // It correctly removes slashes that are followed by allowed puncts.
 // This is to allow for a correct round trip parsing of cpes with quoted characters.
 func stripSlashes(s string) string {
-	const allowedPunct = "-!\"#$%&'()+,./:;<=>@[]^`{|}!~"
 	sb := strings.Builder{}
 	for i, c := range s {
-		if c == '\\' && i+1 < len(s) && strings.ContainsRune(allowedPunct, rune(s[i+1])) {
+		if c == '\\' && i+1 < len(s) && strings.ContainsRune(allowedCPEPunctuation, rune(s[i+1])) {
 			continue
 		} else {
 			sb.WriteRune(c)
@@ -110,12 +139,13 @@ func CPEString(c CPE) string {
 // end up becoming "prefix" instead causing loss of information and
 // incorrect CPEs being generated.
 func sanitize(s string) string {
-	const allowedPunct = "-!\"#$%&'()+,./:;<=>@[]^`{|}!~"
 	// replace spaces with underscores
 	in := strings.ReplaceAll(s, " ", "_")
+
+	// escape allowable punctuation per section 5.3.2 in the CPE 2.3 spec
 	sb := strings.Builder{}
 	for _, c := range in {
-		if strings.ContainsRune(allowedPunct, c) {
+		if strings.ContainsRune(allowedCPEPunctuation, c) {
 			sb.WriteRune('\\')
 		}
 		sb.WriteRune(c)
diff --git a/syft/pkg/cpe_test.go b/syft/pkg/cpe_test.go
index a74ab05e4e7a..1f610541c477 100644
--- a/syft/pkg/cpe_test.go
+++ b/syft/pkg/cpe_test.go
@@ -4,9 +4,11 @@ import (
 	"encoding/json"
 	"fmt"
 	"io/ioutil"
+	"strings"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )
 
 func must(c CPE, e error) CPE {
@@ -91,10 +93,9 @@ func Test_CPEParser(t *testing.T) {
 		WFN       CPE    `json:"wfn"`
 	}{}
 	out, err := ioutil.ReadFile("test-fixtures/cpe-data.json")
-	if err != nil {
-		t.Fatal("Unable to read test-fixtures/cpe-data.json: ", err)
-	}
-	json.Unmarshal(out, &testCases)
+	require.NoError(t, err)
+	require.NoError(t, json.Unmarshal(out, &testCases))
+
 	for _, test := range testCases {
 		t.Run(test.CPEString, func(t *testing.T) {
 			c1, err := NewCPE(test.CPEString)
@@ -110,19 +111,72 @@ func Test_CPEParser(t *testing.T) {
 }
 
 func Test_InvalidCPE(t *testing.T) {
+	type testcase struct {
+		name        string
+		in          string
+		expected    string
+		expectedErr bool
+	}
 
-	testCases := []string{
-		"cpe:2.3:a:some-vendor:name:1:3.2:*:*:*:*:*:*:*",
-		"cpe:2.3:a:some-vendor:name:1^:*:*:*:*:*:*:*",
-		"cpe:2.3:a:some-vendor:name:**:*:*:*:*:*:*:*",
-		"cpe:2.3:a:some-vendor:name:*\\:*:*:*:*:*:*:*",
+	tests := []testcase{
+		{
+			// 5.3.2: The underscore (x5f) MAY be used, and it SHOULD be used in place of whitespace characters (which SHALL NOT be used)
+			name:     "translates spaces",
+			in:       "cpe:2.3:a:some-vendor:name:1 2:*:*:*:*:*:*:*",
+			expected: "cpe:2.3:a:some-vendor:name:1_2:*:*:*:*:*:*:*",
+		},
+		{
+			// it isn't easily possible in the string formatted string to detect improper escaping of : (it will fail parsing)
+			name:        "unescaped ':' cannot be helped -- too many fields",
+			in:          "cpe:2.3:a:some-vendor:name:::*:*:*:*:*:*:*",
+			expectedErr: true,
+		},
+		{
+			name:     "too few fields",
+			in:       "cpe:2.3:a:some-vendor:name:*:*:*:*:*:*:*",
+			expected: "cpe:2.3:a:some-vendor:name:*:*:*:*:*:*:*:*",
+		},
+		// Note: though the CPE spec does not allow for ? and * as escaped character input, these seem to be allowed in
+		// the NVD CPE validator for this reason these edge cases were removed
 	}
 
-	for _, test := range testCases {
-		t.Run(test, func(t *testing.T) {
-			_, err := NewCPE(test)
-			assert.Error(t, err)
-			assert.Contains(t, fmt.Sprint(err), "regex")
+	// the wfn library does not account for escapes of . and -
+	exceptions := ".-"
+	// it isn't easily possible in the string formatted string to detect improper escaping of : (it will fail parsing)
+	skip := ":"
+
+	// make escape exceptions for section 5.3.2 of the CPE spec (2.3)
+	for _, char := range allowedCPEPunctuation {
+		if strings.Contains(skip, string(char)) {
+			continue
+		}
+
+		in := fmt.Sprintf("cpe:2.3:a:some-vendor:name:*:%s:*:*:*:*:*:*", string(char))
+		exp := fmt.Sprintf(`cpe:2.3:a:some-vendor:name:*:\%s:*:*:*:*:*:*`, string(char))
+		if strings.Contains(exceptions, string(char)) {
+			exp = in
+		}
+
+		tests = append(tests, testcase{
+			name:        fmt.Sprintf("allowes future escape of character (%s)", string(char)),
+			in:          in,
+			expected:    exp,
+			expectedErr: false,
+		})
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			c, err := NewCPE(test.in)
+			if test.expectedErr {
+				assert.Error(t, err)
+				if t.Failed() {
+					t.Logf("got CPE: %q details: %+v", CPEString(c), c)
+				}
+				return
+			}
+			require.NoError(t, err)
+			assert.Equal(t, test.expected, CPEString(c))
 		})
 	}
 }