prefix.go

package ctystrings

import (
	"fmt"
	"unicode/utf8"

	"github.com/apparentlymart/go-textseg/v15/textseg"
	"golang.org/x/text/unicode/norm"
)

// SafeKnownPrefix takes a string intended to represent a known prefix of
// another string and modifies it so that it would be safe to use with
// byte-based prefix matching against another NFC-normalized string. It
// also takes into account grapheme cluster boundaries and trims off any
// suffix that could potentially be an incomplete grapheme cluster.
//
// Specifically, SafeKnownPrefix first applies NFC normalization to the prefix
// and then trims off one or more characters from the end of the string which
// could potentially be transformed into a different character if another
// string were appended to it. For example, a trailing latin letter will
// typically be trimmed because appending a combining diacritic mark would
// transform it into a different character.
//
// This transformation is important whenever the remainder of the string is
// arbitrary user input not directly controlled by the application. If an
// application can guarantee that the remainder of the string will not begin
// with combining marks then it is safe to instead just normalize the prefix
// string with [Normalize].
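//
// As an illustrative example (the exact result depends on the Unicode tables
// in the underlying libraries), a prefix ending in a plain latin letter such
// as "hello" would typically be shortened to "hell", because the final letter
// could still be composed with a following combining mark, whereas a prefix
// ending in a non-combining delimiter such as "hello-" can usually be
// returned in full.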
func SafeKnownPrefix(prefix string) string {
	prefix = Normalize(prefix)

	// Our starting approach here is essentially what a streaming parser would
	// do when consuming a Unicode string in chunks and needing to determine
	// what prefix of the current buffer is safe to process without waiting for
	// more information, which is described in TR15 section 13.1
	// "Buffering with Unicode Normalization":
	// https://unicode.org/reports/tr15/#Buffering_with_Unicode_Normalization
	//
	// The general idea here is to find the last character in the string that
	// could potentially start a sequence of codepoints that would combine
	// together, and then truncate the string to exclude that character and
	// everything after it.
	form := norm.NFC
	lastBoundary := form.LastBoundary([]byte(prefix))
	if lastBoundary != -1 && lastBoundary != len(prefix) {
		prefix = prefix[:lastBoundary]
		// If we get here then we've already shortened the prefix and so
		// further analysis below is unnecessary because it would be relying
		// on an incomplete prefix anyway.
		return prefix
	}
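
	// As an illustrative aside: for a prefix ending in a plain latin letter,
	// such as "hello", the last boundary typically falls before that final
	// letter (a following combining mark could still compose with it), so
	// the early return above would already reduce the prefix to "hell".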

	// Now we'll use the textseg package's grapheme cluster scanner to scan
	// as far through the string as we can without the scanner telling us
	// that it would need more bytes to decide.
	//
	// This step is conservative because the grapheme cluster rules are not
	// designed with prefix-matching in mind. In the base case we'll just
	// always discard the last grapheme cluster, although we do have some
	// special cases for trailing codepoints that can't possibly combine with
	// subsequent codepoints to form a single grapheme cluster and which seem
	// likely to arise often in practical use.
	remain := []byte(prefix)
	prevBoundary := 0
	thisBoundary := 0
	for len(remain) > 0 {
		advance, _, err := textseg.ScanGraphemeClusters(remain, false)
		if err != nil {
			// ScanGraphemeClusters should never return an error because
			// any sequence of valid UTF-8 encodings is valid input.
			panic(fmt.Sprintf("textseg.ScanGraphemeClusters returned error: %s", err))
		}
		if advance == 0 {
			// If we have at least one byte remaining but the scanner cannot
			// advance then that means the remainder might be an incomplete
			// grapheme cluster and so we need to stop here, discarding the
			// rest of the input. However, we do now know that we can safely
			// include what we found on the previous iteration of this loop.
			prevBoundary = thisBoundary
			break
		}
		prevBoundary = thisBoundary
		thisBoundary += advance
		remain = remain[advance:]
	}

	// This is our heuristic for detecting cases where we can be sure that
	// the above algorithm was too conservative because the last segment
	// we found is definitely not subject to the grapheme cluster "do not split"
	// rules.
	suspect := prefix[prevBoundary:thisBoundary]
	if sequenceMustEndGraphemeCluster(suspect) {
		prevBoundary = thisBoundary
	}

	return prefix[:prevBoundary]
}
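
// exampleSafeKnownPrefixUsage is an illustrative sketch and not part of the
// upstream file: it shows how SafeKnownPrefix is intended to be combined with
// plain byte-based prefix matching. The concrete values in the comments are
// assumptions based on the documented behaviour and may vary with the Unicode
// tables compiled into the underlying libraries.
func exampleSafeKnownPrefixUsage() {
	known := "hello"               // a known prefix of some longer, user-controlled string
	safe := SafeKnownPrefix(known) // typically "hell": the trailing letter could still combine

	// Normalize the candidate the same way, then do a plain byte comparison
	// of its leading bytes against the safe prefix.
	candidate := Normalize("hello world")
	matches := len(candidate) >= len(safe) && candidate[:len(safe)] == safe
	fmt.Println(safe, matches) // e.g. "hell true"
}
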
// sequenceMustEndGraphemeCluster is a heuristic we use to avoid discarding
// the final grapheme cluster of a prefix in SafeKnownPrefix by recognizing
// that a particular sequence is one known to not be subject to any of
// the UAX29 "do not break" rules.
//
// If this function returns true then it is safe to include the given byte
// sequence at the end of a safe prefix. Otherwise we don't know whether or
// not it is safe.
func sequenceMustEndGraphemeCluster(s string) bool {
	// For now we're only considering sequences that represent a single
	// codepoint. We'll assume that any sequence of two or more codepoints
	// that could be a grapheme cluster might be extendable.
	if utf8.RuneCountInString(s) != 1 {
		return false
	}

	r, _ := utf8.DecodeRuneInString(s)

	// Our initial ruleset is focused on characters that are commonly used
	// as delimiters in text intended for both human and machine use, such
	// as JSON documents.
	//
	// We don't include any letters or digits of any script here intentionally
	// because those are the ones most likely to be subject to combining rules
	// in either current or future Unicode specifications.
	//
	// We can safely grow this set over time, but we should be very careful
	// about shrinking it because it could cause value refinements to loosen
	// and thus cause results that were once known to become unknown.
	switch r {
	case '-', '_', ':', ';', '/', '\\', ',', '.', '(', ')', '{', '}', '[', ']', '|', '?', '!', '~', ' ', '\t', '@', '#', '$', '%', '^', '&', '*', '+', '"', '\'':
		return true
	default:
		return false
	}
}
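
// exampleDelimiterHeuristic is an illustrative sketch and not part of the
// upstream file: it demonstrates how the heuristic above classifies a few
// trailing sequences. A lone delimiter codepoint is known to end a grapheme
// cluster, while a letter or any multi-codepoint sequence is treated as
// potentially extendable and therefore left to the conservative path.
func exampleDelimiterHeuristic() {
	fmt.Println(sequenceMustEndGraphemeCluster("-"))       // true: single non-combining delimiter
	fmt.Println(sequenceMustEndGraphemeCluster("e"))       // false: a letter could gain a combining mark
	fmt.Println(sequenceMustEndGraphemeCluster("e\u0301")) // false: more than one codepoint
}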