Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[bugfix] Use better plaintext representation of status for filtering #3301

Merged
merged 13 commits into from
Sep 16, 2024
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia
- [jackc/pgconn](https://github.com/jackc/pgconn); Postgres driver. [MIT License](https://spdx.org/licenses/MIT.html).
- [jackc/pgx](https://github.com/jackc/pgx); Postgres driver and toolkit. [MIT License](https://spdx.org/licenses/MIT.html).
- [KimMachineGun/automemlimit](https://github.com/KimMachineGun/automemlimit); cgroups memory limit checking. [MIT License](https://spdx.org/licenses/MIT.html).
- [k3a/html2text](https://github.com/k3a/html2text); HTML-to-text conversion. [MIT License](https://spdx.org/licenses/MIT.html).
- [mcuadros/go-syslog](https://github.com/mcuadros/go-syslog); Syslog server library. [MIT License](https://spdx.org/licenses/MIT.html).
- [microcosm-cc/bluemonday](https://github.com/microcosm-cc/bluemonday); HTML user-input sanitization. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).
- [miekg/dns](https://github.com/miekg/dns); DNS utilities. [Go License](https://go.dev/LICENSE).
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ require (
github.com/gorilla/feeds v1.2.0
github.com/gorilla/websocket v1.5.2
github.com/jackc/pgx/v5 v5.7.1
github.com/k3a/html2text v1.2.1
github.com/microcosm-cc/bluemonday v1.0.27
github.com/miekg/dns v1.1.62
github.com/minio/minio-go/v7 v7.0.76
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,8 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k=
github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
Expand Down
20 changes: 20 additions & 0 deletions internal/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ type Caches struct {
// Webfinger provides access to the webfinger URL cache.
Webfinger *ttl.Cache[string, string] // TTL=24hr, sweep=5min

// TTL cache of statuses -> filterable text fields.
// To ensure up-to-date fields, cache is keyed as:
// `[status.ID][status.UpdatedAt.Unix()]`
StatusesFilterableFields *ttl.Cache[string, []string]

// prevent pass-by-value.
_ nocopy
}
Expand Down Expand Up @@ -109,6 +114,7 @@ func (c *Caches) Init() {
c.initUserMuteIDs()
c.initWebfinger()
c.initVisibility()
c.initStatusesFilterableFields()
}

// Start will start any caches that require a background
Expand All @@ -119,6 +125,10 @@ func (c *Caches) Start() {
tryUntil("starting webfinger cache", 5, func() bool {
return c.Webfinger.Start(5 * time.Minute)
})

tryUntil("starting statusesFilterableFields cache", 5, func() bool {
return c.StatusesFilterableFields.Start(5 * time.Minute)
})
}

// Stop will stop any caches that require a background
Expand All @@ -127,6 +137,7 @@ func (c *Caches) Stop() {
log.Infof(nil, "stop: %p", c)

tryUntil("stopping webfinger cache", 5, c.Webfinger.Stop)
tryUntil("stopping statusesFilterableFields cache", 5, c.StatusesFilterableFields.Stop)
}

// Sweep will sweep all the available caches to ensure none
Expand Down Expand Up @@ -204,3 +215,12 @@ func (c *Caches) initWebfinger() {
24*time.Hour,
)
}

// initStatusesFilterableFields allocates the TTL cache that maps
// a status key to that status's filterable text fields, and stores
// it on the Caches struct.
func (c *Caches) initStatusesFilterableFields() {
	// Time-to-live handed to the cache as its final Init argument.
	const entryTTL = 1 * time.Hour

	// NOTE(review): the first two Init arguments look like an initial
	// length and a capacity hint; confirm against the ttl package.
	filterable := new(ttl.Cache[string, []string])
	filterable.Init(0, 512, entryTTL)

	c.StatusesFilterableFields = filterable
}
19 changes: 15 additions & 4 deletions internal/gtsmodel/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ package gtsmodel
import (
"regexp"
"time"

"github.com/superseriousbusiness/gotosocial/internal/util"
)

// Filter stores a filter created by a local account.
Expand Down Expand Up @@ -61,14 +63,23 @@ type FilterKeyword struct {

// Compile will compile this FilterKeyword as a prepared regular expression.
func (k *FilterKeyword) Compile() (err error) {
var wordBreak string
if k.WholeWord != nil && *k.WholeWord {
wordBreak = `\b`
var (
wordBreakStart string
wordBreakEnd string
)

if util.PtrOrZero(k.WholeWord) {
VyrCossont marked this conversation as resolved.
Show resolved Hide resolved
// Either word boundary or
// whitespace or start of line.
wordBreakStart = `(?:\b|\s|^)`
// Either word boundary or
// whitespace or end of line.
wordBreakEnd = `(?:\b|\s|$)`
}

// Compile keyword filter regexp.
quoted := regexp.QuoteMeta(k.Keyword)
k.Regexp, err = regexp.Compile(`(?i)` + wordBreak + quoted + wordBreak)
k.Regexp, err = regexp.Compile(`(?i)` + wordBreakStart + quoted + wordBreakEnd)
return // caller is expected to wrap this error
}

Expand Down
81 changes: 32 additions & 49 deletions internal/typeutils/internaltofrontend.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"context"
"errors"
"fmt"
"slices"
"strconv"
"strings"
"time"

Expand All @@ -35,7 +37,6 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/language"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/media"
"github.com/superseriousbusiness/gotosocial/internal/text"
"github.com/superseriousbusiness/gotosocial/internal/uris"
"github.com/superseriousbusiness/gotosocial/internal/util"
)
Expand Down Expand Up @@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults(
return nil, nil
}

// Extract text fields from the status that we will match filters against.
fields := filterableTextFields(s)
// Key this status based on ID + last updated time,
// to ensure we always filter on latest version.
statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10)

// Check if we have filterable fields cached for this status.
cache := c.state.Caches.StatusesFilterableFields
fields, stored := cache.Get(statusKey)
if !stored {
// We don't have filterable fields
// cached, calculate + cache now.
fields = filterableFields(s)
cache.Set(statusKey, fields)
}

// Record all matching warn filters and the reasons they matched.
filterResults := make([]apimodel.FilterResult, 0, len(filters))
for _, filter := range filters {
if !filterAppliesInContext(filter, filterContext) {
// Filter doesn't apply to this context.
// Filter doesn't apply
// to this context.
continue
}

if filter.Expired(now) {
// Filter doesn't
// apply anymore.
continue
}

// List all matching keywords.
// Assemble matching keywords (if any) from this filter.
keywordMatches := make([]string, 0, len(filter.Keywords))
for _, filterKeyword := range filter.Keywords {
var isMatch bool
for _, field := range fields {
tsmethurst marked this conversation as resolved.
Show resolved Hide resolved
if filterKeyword.Regexp.MatchString(field) {
isMatch = true
break
}
}
if isMatch {
keywordMatches = append(keywordMatches, filterKeyword.Keyword)
for _, keyword := range filter.Keywords {
// Check if at least one filterable field
// in the status matches on this filter.
if slices.ContainsFunc(
fields,
func(field string) bool {
return keyword.Regexp.MatchString(field)
},
) {
// At least one field matched on this filter.
keywordMatches = append(keywordMatches, keyword.Keyword)
}
}

Expand Down Expand Up @@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults(
return filterResults, nil
}

// filterableTextFields collects every piece of text on a status that a
// filter keyword may be matched against, in this order:
//   - content (passed through text.SanitizeToPlaintext)
//   - content warning
//   - media descriptions
//   - poll options
//
// Empty source fields are left out of the returned slice.
func filterableTextFields(s *gtsmodel.Status) []string {
	// Upper bound on the number of entries: content + content
	// warning, one per attachment, and one per poll option.
	capacity := 2 + len(s.Attachments)
	if poll := s.Poll; poll != nil {
		capacity += len(poll.Options)
	}
	out := make([]string, 0, capacity)

	// keep appends v to the result unless it is empty.
	keep := func(v string) {
		if v == "" {
			return
		}
		out = append(out, v)
	}

	// Content is checked for emptiness *before* sanitizing, so the
	// sanitized result is appended even if it comes out empty.
	if s.Content != "" {
		out = append(out, text.SanitizeToPlaintext(s.Content))
	}

	keep(s.ContentWarning)

	for i := range s.Attachments {
		keep(s.Attachments[i].Description)
	}

	if s.Poll != nil {
		for _, choice := range s.Poll.Options {
			keep(choice)
		}
	}

	return out
}

// filterAppliesInContext returns whether a given filter applies in a given context.
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
switch filterContext {
Expand Down
24 changes: 16 additions & 8 deletions internal/typeutils/internaltofrontend_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1063,15 +1063,21 @@ func (suite *InternalToFrontendTestSuite) TestHideFilteredBoostToFrontend() {

// Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect.
func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) {
testStatus := suite.testStatuses["admin_account_status_1"]
testStatus := new(gtsmodel.Status)
*testStatus = *suite.testStatuses["admin_account_status_1"]
testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>`

if boost {
// Modify a fixture boost into a boost of the above status.
boostStatus := suite.testStatuses["admin_account_status_4"]
boostStatus.BoostOf = testStatus
boostStatus.BoostOfID = testStatus.ID
testStatus = boostStatus
boost, err := suite.typeconverter.StatusToBoost(
context.Background(),
testStatus,
suite.testAccounts["admin_account"],
"",
)
if err != nil {
suite.FailNow(err.Error())
}
testStatus = boost
}

requestingAccount := suite.testAccounts["local_account_1"]
Expand Down Expand Up @@ -1103,9 +1109,11 @@ func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wh
[]*gtsmodel.Filter{filter},
nil,
)
if suite.NoError(err) {
suite.NotEmpty(apiStatus.Filtered)
if err != nil {
suite.FailNow(err.Error())
}

suite.NotEmpty(apiStatus.Filtered)
}

func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() {
Expand Down
62 changes: 62 additions & 0 deletions internal/typeutils/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"strconv"
"strings"

"github.com/k3a/html2text"
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
Expand Down Expand Up @@ -284,3 +285,64 @@ func ContentToContentLanguage(

return contentStr, langTagStr
}

// filterableFields returns text fields from
// a status that we might want to filter on:
//
// - content warning
// - content (converted to plaintext from HTML)
// - media descriptions
// - poll options
//
// Each field should be filtered separately.
// This avoids scenarios where false-positive
// multiple-word matches can be made by matching
// the last word of one field + the first word
// of the next field together.
func filterableFields(s *gtsmodel.Status) []string {
// Estimate length of fields.
fieldCount := 2 + len(s.Attachments)
if s.Poll != nil {
fieldCount += len(s.Poll.Options)
}
fields := make([]string, 0, fieldCount)

// Content warning / title.
if s.ContentWarning != "" {
fields = append(fields, s.ContentWarning)
}

// Status content. Though we have raw text
// available for statuses created on our
// instance, use the html2text version to
// remove markdown-formatting characters
// and ensure more consistent filtering.
if s.Content != "" {
tsmethurst marked this conversation as resolved.
Show resolved Hide resolved
text := html2text.HTML2TextWithOptions(
s.Content,
html2text.WithLinksInnerText(),
html2text.WithUnixLineBreaks(),
)
if text != "" {
fields = append(fields, text)
}
}

// Media descriptions.
for _, attachment := range s.Attachments {
if attachment.Description != "" {
fields = append(fields, attachment.Description)
}
}

// Poll options.
if s.Poll != nil {
for _, opt := range s.Poll.Options {
if opt != "" {
fields = append(fields, opt)
}
}
}

return fields
}
Loading