Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[bugfix] Fix unicode-unaware word boundary check in hashtags #1049

Merged
merged 2 commits into from
Nov 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions internal/regexes/regexes.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ const (
// Length limits used by the patterns in this file.
const (
maximumUsernameLength = 64
// Upper bound interpolated into the emojiShortcode pattern via fmt.Sprintf.
maximumEmojiShortcodeLength = 30
// Upper bound interpolated into the hashtagFinder pattern via fmt.Sprintf.
maximumHashtagLength = 30
)

var (
Expand All @@ -66,17 +65,11 @@ var (
// such as @[email protected], returning whatever_user and example.org (without the @ symbols)
MentionName = regexp.MustCompile(mentionName)

// mention regex can be played around with here: https://regex101.com/r/G1oGR0/1
// mention regex can be played around with here: https://regex101.com/r/P0vpYG/1
mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)`
// MentionFinder extracts mentions from a piece of text.
MentionFinder = regexp.MustCompile(mentionFinder)

// hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1
hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength)
// HashtagFinder finds possible hashtags in a string.
// It returns just the string part of the hashtag, not the # symbol.
HashtagFinder = regexp.MustCompile(hashtagFinder)

emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength)
// EmojiShortcode validates an emoji name.
EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode))
Expand Down
48 changes: 29 additions & 19 deletions internal/text/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,36 +27,46 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/regexes"
"github.com/superseriousbusiness/gotosocial/internal/util"
)

func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string {
return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string {
// we have a match
matchTrimmed := strings.TrimSpace(match)
tagAsEntered := matchTrimmed[1:]
spans := util.FindHashtagSpansInText(in)

if len(spans) == 0 {
return in
}

var b strings.Builder
i := 0

spans:
for _, t := range spans {
b.WriteString(in[i:t.First])
i = t.Second
tagAsEntered := in[t.First+1 : t.Second]

// check through the tags to find what we're matching
for _, tag := range tags {
if strings.EqualFold(tagAsEntered, tag.Name) {
// Add any dropped space from match
if unicode.IsSpace(rune(match[0])) {
buf.WriteByte(match[0])
}

// replace the #tag with the formatted tag content
// `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a>
buf.WriteString(`<a href="`)
buf.WriteString(tag.URL)
buf.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
buf.WriteString(tagAsEntered)
buf.WriteString(`</span></a>`)
return buf.String()
b.WriteString(`<a href="`)
b.WriteString(tag.URL)
b.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
b.WriteString(tagAsEntered)
b.WriteString(`</span></a>`)
continue spans
}
}

// the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes
return match
})
b.WriteString(in[t.First:t.Second])
}

// Get the last bits.
i = spans[len(spans)-1].Second
b.WriteString(in[i:])

return b.String()
}

func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string {
Expand Down
90 changes: 82 additions & 8 deletions internal/util/statustools.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@
package util

import (
"strings"
"unicode"
"unicode/utf8"

"github.com/superseriousbusiness/gotosocial/internal/regexes"
)

const (
// maximumHashtagLength is the largest hashtag length, not counting the
// leading `#`, that appendTag will accept.
maximumHashtagLength = 30
)

// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,
// and applies a regex to it to return a deduplicated list of account names
// mentioned in that text, in the format "@[email protected]" or "@username" for
Expand All @@ -36,16 +41,71 @@ func DeriveMentionNamesFromText(text string) []string {
return UniqueStrings(mentionedAccounts)
}

// Pair is a generic two-element tuple holding a First and a Second value
// of independently chosen types.
type Pair[A, B any] struct {
First A
Second B
}

// Span is a pair of byte indices into the original string that delimit one
// hashtag. First is the index of the leading `#`; Second is the index just
// past the hashtag's last byte, so text[First+1:Second] is the tag without
// its `#`.
type Span = Pair[int, int]

// Takes a plaintext (ie., not HTML-formatted) text,
// and returns a slice of unique hashtags.
func DeriveHashtagsFromText(text string) []string {
tagsMap := make(map[string]bool)
tags := []string{}
for _, m := range regexes.HashtagFinder.FindAllStringSubmatch(text, -1) {
tags = append(tags, strings.TrimPrefix(m[1], "#"))

for _, v := range FindHashtagSpansInText(text) {
t := text[v.First+1 : v.Second]
if _, value := tagsMap[t]; !value {
tagsMap[t] = true
tags = append(tags, t)
}
}

return tags
}

// Takes a plaintext (ie., not HTML-formatted) text,
// and returns a list of pairs of indices into the original string, where
// hashtags are located.
func FindHashtagSpansInText(text string) []Span {
tags := []Span{}
start := 0
// Keep one rune of lookbehind.
prev := ' '
inTag := false

for i, r := range text {
illfygli marked this conversation as resolved.
Show resolved Hide resolved
if r == '#' && isHashtagBoundary(prev) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth replacing these with a switch statement or nah? just a style thing

Copy link
Contributor Author

@illfygli illfygli Nov 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool I didn't know about switch /* nothing here */ { ... }!
I tried it but then I cuoldn't do assignment in a case, so I left it like that instead of duplicating that bit. :)

// Start of hashtag.
inTag = true
start = i
} else if inTag && !isPermittedInHashtag(r) && !isHashtagBoundary(r) {
// Inside the hashtag, but it was a phoney, gottem.
inTag = false
} else if inTag && isHashtagBoundary(r) {
// End of hashtag.
inTag = false
appendTag(&tags, text, start, i)
} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) {
// End of text.
appendTag(&tags, text, start, irl)
}

prev = r
}

return tags
}

// appendTag appends the span [start, end) of text to *tags if the hashtag it
// delimits is non-empty and at most maximumHashtagLength characters long.
// start is the byte index of the leading `#`, which is not counted.
//
// The length is measured in runes, not bytes, so that hashtags written in
// multi-byte scripts get the same character budget as ASCII ones.
func appendTag(tags *[]Span, text string, start int, end int) {
	if end-start < 2 {
		// Just a lone `#` (or an empty range): not a hashtag.
		return
	}

	// This check could be moved out into the parsing loop if necessary!
	if utf8.RuneCountInString(text[start+1:end]) <= maximumHashtagLength {
		*tags = append(*tags, Span{First: start, Second: end})
	}
}

// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,
Expand All @@ -58,3 +118,17 @@ func DeriveEmojisFromText(text string) []string {
}
return UniqueStrings(emojis)
}

// isPermittedInHashtag reports whether r may appear in the body of a
// hashtag: any Unicode letter or any Unicode number.
func isPermittedInHashtag(r rune) bool {
	if unicode.IsLetter(r) {
		return true
	}
	return unicode.IsNumber(r)
}

// isHashtagBoundary reports whether r is a rune at which a hashtag may begin
// (when r precedes the `#`) or must end (when r follows the tag body).
func isHashtagBoundary(r rune) bool {
	switch {
	case r == '#':
		// `###lol` should work.
		return true
	case unicode.IsSpace(r), unicode.IsControl(r):
		// All kinds of Unicode whitespace and control characters.
		return true
	case r == '/', r == '&':
		// `someurl/#fragment` should not match, neither should
		// HTML entities like `&#35;`.
		return false
	case unicode.Is(unicode.Pc, r):
		// "Punctuation, connecting" (like `_`) is not a boundary.
		return false
	default:
		// Every other kind of punctuation is.
		return unicode.IsPunct(r)
	}
}
44 changes: 34 additions & 10 deletions internal/util/statustools_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,26 +77,50 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() {

# testing this one shouldn't work

#thisshouldwork
#thisshouldwork #dupe #dupe!! #dupe

here's a link with a fragment: https://example.org/whatever#ahhh
here's another link with a fragment: https://example.org/whatever/#ahhh

#ThisShouldAlsoWork #not_this_though
(#ThisShouldAlsoWork) #not_this_though

#111111 thisalsoshouldn'twork#### ##

#alimentación, #saúde
#alimentación, #saúde, #lävistää, #ö, #네
#ThisOneIsThirtyOneCharactersLon... ...ng
#ThisOneIsThirteyCharactersLong
`

tags := util.DeriveHashtagsFromText(statusText)
assert.Len(suite.T(), tags, 7)
assert.Len(suite.T(), tags, 12)
assert.Equal(suite.T(), "testing123", tags[0])
assert.Equal(suite.T(), "also", tags[1])
assert.Equal(suite.T(), "thisshouldwork", tags[2])
assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[3])
assert.Equal(suite.T(), "111111", tags[4])
assert.Equal(suite.T(), "alimentación", tags[5])
assert.Equal(suite.T(), "saúde", tags[6])
assert.Equal(suite.T(), "dupe", tags[3])
assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4])
assert.Equal(suite.T(), "111111", tags[5])
assert.Equal(suite.T(), "alimentación", tags[6])
assert.Equal(suite.T(), "saúde", tags[7])
assert.Equal(suite.T(), "lävistää", tags[8])
assert.Equal(suite.T(), "ö", tags[9])
assert.Equal(suite.T(), "네", tags[10])
assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[11])

statusText = `#올빼미 hej`
tags = util.DeriveHashtagsFromText(statusText)
assert.Equal(suite.T(), "올빼미", tags[0])
}

// TestHashtagSpansOK checks the byte offsets that FindHashtagSpansInText
// reports for each hashtag.
func (suite *StatusTestSuite) TestHashtagSpansOK() {
	// Each tag is named after the byte offset it starts at, so there must be
	// exactly three spaces between `#3` and `#8aa` (total length 12).
	statusText := `#0 #3   #8aa`

	spans := util.FindHashtagSpansInText(statusText)
	// Bail out instead of panicking on an index out of range below.
	if !assert.Len(suite.T(), spans, 3) {
		return
	}

	assert.Equal(suite.T(), 0, spans[0].First)
	assert.Equal(suite.T(), 2, spans[0].Second)
	assert.Equal(suite.T(), 3, spans[1].First)
	assert.Equal(suite.T(), 5, spans[1].Second)
	assert.Equal(suite.T(), 8, spans[2].First)
	assert.Equal(suite.T(), 12, spans[2].Second)
}

func (suite *StatusTestSuite) TestDeriveEmojiOK() {
Expand Down Expand Up @@ -127,7 +151,7 @@ Here's some normal text with an :emoji: at the end
func (suite *StatusTestSuite) TestDeriveMultiple() {
statusText := `Another test @[email protected]

#Hashtag
#HashTag

Text`

Expand All @@ -139,7 +163,7 @@ func (suite *StatusTestSuite) TestDeriveMultiple() {
assert.Equal(suite.T(), "@[email protected]", ms[0])

assert.Len(suite.T(), hs, 1)
assert.Equal(suite.T(), "Hashtag", hs[0])
assert.Contains(suite.T(), hs, "HashTag")

assert.Len(suite.T(), es, 0)
}
Expand Down