From 3990f2417f091da99f4e8df7a3026b55f6aef039 Mon Sep 17 00:00:00 2001 From: ugla Date: Fri, 11 Nov 2022 16:04:34 +0100 Subject: [PATCH 1/2] [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. --- internal/regexes/regexes.go | 8 +-- internal/text/common.go | 48 ++++++++++-------- internal/util/statustools.go | 81 ++++++++++++++++++++++++++++--- internal/util/statustools_test.go | 40 +++++++++++---- 4 files changed, 133 insertions(+), 44 deletions(-) diff --git a/internal/regexes/regexes.go b/internal/regexes/regexes.go index dd3d9ce406..9066191ecf 100644 --- a/internal/regexes/regexes.go +++ b/internal/regexes/regexes.go @@ -66,17 +66,11 @@ var ( // such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols) MentionName = regexp.MustCompile(mentionName) - // mention regex can be played around with here: https://regex101.com/r/G1oGR0/1 + // mention regex can be played around with here: https://regex101.com/r/P0vpYG/1 mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)` // MentionFinder extracts mentions from a piece of text. MentionFinder = regexp.MustCompile(mentionFinder) - // hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1 - hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength) - // HashtagFinder finds possible hashtags in a string. - // It returns just the string part of the hashtag, not the # symbol. - HashtagFinder = regexp.MustCompile(hashtagFinder) - emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength) // EmojiShortcode validates an emoji name. EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode)) diff --git a/internal/text/common.go b/internal/text/common.go index 005f9dfe1e..ca4b974653 100644 --- a/internal/text/common.go +++ b/internal/text/common.go @@ -27,36 +27,46 @@ import ( "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/regexes" + "github.com/superseriousbusiness/gotosocial/internal/util" ) func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string { - return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string { - // we have a match - matchTrimmed := strings.TrimSpace(match) - tagAsEntered := matchTrimmed[1:] + spans := util.FindHashtagSpansInText(in) + + if len(spans) == 0 { + return in + } + + var b strings.Builder + i := 0 + +spans: + for _, t := range spans { + b.WriteString(in[i:t.First]) + i = t.Second + tagAsEntered := in[t.First+1 : t.Second] - // check through the tags to find what we're matching for _, tag := range tags { if strings.EqualFold(tagAsEntered, tag.Name) { - // Add any dropped space from match - if unicode.IsSpace(rune(match[0])) { - buf.WriteByte(match[0]) - } - // replace the #tag with the formatted tag content // ` - buf.WriteString(``) - return buf.String() + b.WriteString(``) + continue spans } } - // the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes - return match - }) + b.WriteString(in[t.First:t.Second]) + } + + // Get the last bits. + i = spans[len(spans)-1].Second + b.WriteString(in[i:]) + + return b.String() } func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string { diff --git a/internal/util/statustools.go b/internal/util/statustools.go index b2b7fffa17..d34cb2635d 100644 --- a/internal/util/statustools.go +++ b/internal/util/statustools.go @@ -19,7 +19,8 @@ package util import ( - "strings" + "unicode" + "unicode/utf8" "github.com/superseriousbusiness/gotosocial/internal/regexes" ) @@ -36,16 +37,69 @@ func DeriveMentionNamesFromText(text string) []string { return UniqueStrings(mentionedAccounts) } -// DeriveHashtagsFromText takes a plaintext (ie., not html-formatted) text, -// and applies a regex to it to return a deduplicated list of hashtags -// used in that text, without the leading #. The case of the returned -// tags will be lowered, for consistency. +type Pair[A, B any] struct { + First A + Second B +} + +// Byte index in original string +// `First` includes `#`. +type Span = Pair[int, int] + +// Takes a plaintext (ie., not HTML-formatted) text, +// and returns a slice of unique hashtags. func DeriveHashtagsFromText(text string) []string { + tagsMap := make(map[string]bool) tags := []string{} - for _, m := range regexes.HashtagFinder.FindAllStringSubmatch(text, -1) { - tags = append(tags, strings.TrimPrefix(m[1], "#")) + + for _, v := range FindHashtagSpansInText(text) { + t := text[v.First+1 : v.Second] + if _, value := tagsMap[t]; !value { + tagsMap[t] = true + tags = append(tags, t) + } + } + + return tags +} + +// Takes a plaintext (ie., not HTML-formatted) text, +// and returns a list of pairs of indices into the original string, where +// hashtags are located. +func FindHashtagSpansInText(text string) []Span { + tags := []Span{} + start := 0 + // Keep one rune of lookbehind. + prev := ' ' + inTag := false + + for i, r := range text { + if r == '#' && isHashtagBoundary(prev) { + // Start of hashtag. + inTag = true + start = i + } else if inTag && !isPermittedInHashtag(r) && !isHashtagBoundary(r) { + // Inside the hashtag, but it was a phoney, gottem. + inTag = false + } else if inTag && isHashtagBoundary(r) { + // End of hashtag. + inTag = false + appendTag(&tags, text, start, i) + } else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) { + // End of text. + appendTag(&tags, text, start, irl) + } + + prev = r + } + + return tags +} + +func appendTag(tags *[]Span, text string, start int, endEx int) { + if 1 < endEx-start { + *tags = append(*tags, Span{First: start, Second: endEx}) } - return UniqueStrings(tags) } // DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text, @@ -58,3 +112,14 @@ func DeriveEmojisFromText(text string) []string { } return UniqueStrings(emojis) } + +func isPermittedInHashtag(r rune) bool { + return unicode.IsLetter(r) || unicode.IsNumber(r) +} + +func isHashtagBoundary(r rune) bool { + return r == '#' || + unicode.IsSpace(r) || + unicode.IsControl(r) || + ('&' != r && '/' != r && unicode.Is(unicode.Categories["Po"], r)) +} diff --git a/internal/util/statustools_test.go b/internal/util/statustools_test.go index d9f344e4b0..363f7708bb 100644 --- a/internal/util/statustools_test.go +++ b/internal/util/statustools_test.go @@ -77,26 +77,46 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() { # testing this one shouldn't work - #thisshouldwork + #thisshouldwork #dupe #dupe!! #dupe here's a link with a fragment: https://example.org/whatever#ahhh + here's another link with a fragment: https://example.org/whatever/#ahhh #ThisShouldAlsoWork #not_this_though #111111 thisalsoshouldn'twork#### ## -#alimentación, #saúde -` +#alimentación, #saúde, #lävistää, #ö, #네` tags := util.DeriveHashtagsFromText(statusText) - assert.Len(suite.T(), tags, 7) + assert.Len(suite.T(), tags, 11) assert.Equal(suite.T(), "testing123", tags[0]) assert.Equal(suite.T(), "also", tags[1]) assert.Equal(suite.T(), "thisshouldwork", tags[2]) - assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[3]) - assert.Equal(suite.T(), "111111", tags[4]) - assert.Equal(suite.T(), "alimentación", tags[5]) - assert.Equal(suite.T(), "saúde", tags[6]) + assert.Equal(suite.T(), "dupe", tags[3]) + assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4]) + assert.Equal(suite.T(), "111111", tags[5]) + assert.Equal(suite.T(), "alimentación", tags[6]) + assert.Equal(suite.T(), "saúde", tags[7]) + assert.Equal(suite.T(), "lävistää", tags[8]) + assert.Equal(suite.T(), "ö", tags[9]) + assert.Equal(suite.T(), "네", tags[10]) + + statusText = `#올빼미 hej` + tags = util.DeriveHashtagsFromText(statusText) + assert.Equal(suite.T(), "올빼미", tags[0]) +} + +func (suite *StatusTestSuite) TestHashtagSpansOK() { + statusText := `#0 #3 #8aa` + + spans := util.FindHashtagSpansInText(statusText) + assert.Equal(suite.T(), 0, spans[0].First) + assert.Equal(suite.T(), 2, spans[0].Second) + assert.Equal(suite.T(), 3, spans[1].First) + assert.Equal(suite.T(), 5, spans[1].Second) + assert.Equal(suite.T(), 8, spans[2].First) + assert.Equal(suite.T(), 12, spans[2].Second) } func (suite *StatusTestSuite) TestDeriveEmojiOK() { @@ -127,7 +147,7 @@ Here's some normal text with an :emoji: at the end func (suite *StatusTestSuite) TestDeriveMultiple() { statusText := `Another test @foss_satan@fossbros-anonymous.io - #Hashtag + #HashTag Text` @@ -139,7 +159,7 @@ func (suite *StatusTestSuite) TestDeriveMultiple() { assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0]) assert.Len(suite.T(), hs, 1) - assert.Equal(suite.T(), "Hashtag", hs[0]) + assert.Contains(suite.T(), hs, "HashTag") assert.Len(suite.T(), es, 0) } From 32216135e0f69ce19aebdad3223ad4901059ea85 Mon Sep 17 00:00:00 2001 From: ugla Date: Mon, 14 Nov 2022 23:12:24 +0100 Subject: [PATCH 2/2] [bugfix] Add back hashtag max length and add tests for it --- internal/regexes/regexes.go | 1 - internal/util/statustools.go | 23 ++++++++++++++++------- internal/util/statustools_test.go | 10 +++++++--- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/internal/regexes/regexes.go b/internal/regexes/regexes.go index 9066191ecf..c9286611e3 100644 --- a/internal/regexes/regexes.go +++ b/internal/regexes/regexes.go @@ -47,7 +47,6 @@ const ( const ( maximumUsernameLength = 64 maximumEmojiShortcodeLength = 30 - maximumHashtagLength = 30 ) var ( diff --git a/internal/util/statustools.go b/internal/util/statustools.go index d34cb2635d..b1fd7968b9 100644 --- a/internal/util/statustools.go +++ b/internal/util/statustools.go @@ -25,6 +25,10 @@ import ( "github.com/superseriousbusiness/gotosocial/internal/regexes" ) +const ( + maximumHashtagLength = 30 +) + // DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text, // and applies a regex to it to return a deduplicated list of account names // mentioned in that text, in the format "@user@example.org" or "@username" for @@ -96,9 +100,11 @@ func FindHashtagSpansInText(text string) []Span { return tags } -func appendTag(tags *[]Span, text string, start int, endEx int) { - if 1 < endEx-start { - *tags = append(*tags, Span{First: start, Second: endEx}) +func appendTag(tags *[]Span, text string, start int, end int) { + l := end - start - 1 + // This check could be moved out into the parsing loop if necessary! + if 0 < l && l <= maximumHashtagLength { + *tags = append(*tags, Span{First: start, Second: end}) } } @@ -117,9 +123,12 @@ func isPermittedInHashtag(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) } +// Decides where to break before or after a hashtag. func isHashtagBoundary(r rune) bool { - return r == '#' || - unicode.IsSpace(r) || - unicode.IsControl(r) || - ('&' != r && '/' != r && unicode.Is(unicode.Categories["Po"], r)) + return r == '#' || // `###lol` should work + unicode.IsSpace(r) || // All kinds of Unicode whitespace. + unicode.IsControl(r) || // All kinds of control characters, like tab. + // Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`). + // But `someurl/#fragment` should not match, neither should HTML entities like `#`. + ('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r)) } diff --git a/internal/util/statustools_test.go b/internal/util/statustools_test.go index 363f7708bb..214fab5534 100644 --- a/internal/util/statustools_test.go +++ b/internal/util/statustools_test.go @@ -82,14 +82,17 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() { here's a link with a fragment: https://example.org/whatever#ahhh here's another link with a fragment: https://example.org/whatever/#ahhh -#ThisShouldAlsoWork #not_this_though +(#ThisShouldAlsoWork) #not_this_though #111111 thisalsoshouldn'twork#### ## -#alimentación, #saúde, #lävistää, #ö, #네` +#alimentación, #saúde, #lävistää, #ö, #네 +#ThisOneIsThirtyOneCharactersLon... ...ng +#ThisOneIsThirteyCharactersLong +` tags := util.DeriveHashtagsFromText(statusText) - assert.Len(suite.T(), tags, 11) + assert.Len(suite.T(), tags, 12) assert.Equal(suite.T(), "testing123", tags[0]) assert.Equal(suite.T(), "also", tags[1]) assert.Equal(suite.T(), "thisshouldwork", tags[2]) @@ -101,6 +104,7 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() { assert.Equal(suite.T(), "lävistää", tags[8]) assert.Equal(suite.T(), "ö", tags[9]) assert.Equal(suite.T(), "네", tags[10]) + assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[11]) statusText = `#올빼미 hej` tags = util.DeriveHashtagsFromText(statusText)