Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: umputun/tg-spam
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: v1.13.4
Choose a base ref
...
head repository: umputun/tg-spam
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: v1.13.5
Choose a head ref
  • 3 commits
  • 5 files changed
  • 3 contributors

Commits on Aug 3, 2024

  1. exclude goreleaser from dependabot updates

    Goreleaser is known to break backwards compatibility in minor versions
    and shouldn't be updated automatically.
    paskal authored and umputun committed Aug 3, 2024
    Copy the full SHA
    27299b4 View commit details
  2. Bump the go-modules-updates group

    Bumps the go-modules-updates group in /_examples/simplechat with 2 updates: [github.com/umputun/tg-spam](https://github.com/umputun/tg-spam) and [modernc.org/sqlite](https://gitlab.com/cznic/sqlite).
    
    
    Updates `github.com/umputun/tg-spam` from 1.13.2 to 1.13.4
    - [Release notes](https://github.com/umputun/tg-spam/releases)
    - [Changelog](https://github.com/umputun/tg-spam/blob/master/.goreleaser.yml)
    - [Commits](https://github.com/umputun/tg-spam/compare/v1.13.2...v1.13.4)
    
    Updates `modernc.org/sqlite` from 1.30.1 to 1.31.1
    - [Commits](https://gitlab.com/cznic/sqlite/compare/v1.30.1...v1.31.1)
    
    ---
    updated-dependencies:
    - dependency-name: github.com/umputun/tg-spam
      dependency-type: direct:production
      update-type: version-update:semver-patch
      dependency-group: go-modules-updates
    - dependency-name: modernc.org/sqlite
      dependency-type: direct:production
      update-type: version-update:semver-minor
      dependency-group: go-modules-updates
    ...
    
    Signed-off-by: dependabot[bot] <support@github.com>
    dependabot[bot] authored and umputun committed Aug 3, 2024
    Copy the full SHA
    5424f66 View commit details

Commits on Aug 4, 2024

  1. universal detection of scripts for multi-lingual check

    umputun committed Aug 4, 2024
    Copy the full SHA
    68dcc02 View commit details
Showing with 49 additions and 54 deletions.
  1. +2 −0 .github/dependabot.yml
  2. +4 −4 _examples/simplechat/go.mod
  3. +12 −12 _examples/simplechat/go.sum
  4. +28 −36 lib/tgspam/detector.go
  5. +3 −2 lib/tgspam/detector_test.go
2 changes: 2 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -9,6 +9,8 @@ updates:
directory: "/"
schedule:
interval: "monthly"
ignore:
- dependency-name: "goreleaser/goreleaser-action"
groups:
"GitHub Actions updates":
patterns:
8 changes: 4 additions & 4 deletions _examples/simplechat/go.mod
Original file line number Diff line number Diff line change
@@ -5,8 +5,8 @@ go 1.22
toolchain go1.22.3

require (
github.com/umputun/tg-spam v1.13.2
modernc.org/sqlite v1.30.1
github.com/umputun/tg-spam v1.13.4
modernc.org/sqlite v1.31.1
)

require (
@@ -21,9 +21,9 @@ require (
github.com/sandwich-go/gpt3-encoder v0.0.0-20230203030618-cd99729dd0dd // indirect
github.com/sashabaranov/go-openai v1.24.1 // indirect
golang.org/x/exp v0.0.0-20240531132922-fd00a4e0eefc // indirect
golang.org/x/sys v0.20.0 // indirect
golang.org/x/sys v0.22.0 // indirect
modernc.org/gc/v3 v3.0.0-20240304020402-f0dba7c97c2b // indirect
modernc.org/libc v1.52.1 // indirect
modernc.org/libc v1.55.3 // indirect
modernc.org/mathutil v1.6.0 // indirect
modernc.org/memory v1.8.0 // indirect
modernc.org/strutil v1.2.0 // indirect
24 changes: 12 additions & 12 deletions _examples/simplechat/go.sum
Original file line number Diff line number Diff line change
@@ -26,33 +26,33 @@ github.com/sashabaranov/go-openai v1.24.1 h1:DWK95XViNb+agQtuzsn+FyHhn3HQJ7Va8z0
github.com/sashabaranov/go-openai v1.24.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/umputun/tg-spam v1.13.2 h1:IZEqnJEq+sS42d+0P87opjoK4X/MFujJUlbdv0A8AMU=
github.com/umputun/tg-spam v1.13.2/go.mod h1:Y6Aqx9FRw5WIWophjb8RCBkvdUzRA8WtTf5jT4qC6uQ=
github.com/umputun/tg-spam v1.13.4 h1:jHxiObk007GXkSZZ/MXByadhwYPm+BWZBqrpE/NU9e8=
github.com/umputun/tg-spam v1.13.4/go.mod h1:Y6Aqx9FRw5WIWophjb8RCBkvdUzRA8WtTf5jT4qC6uQ=
golang.org/x/exp v0.0.0-20240531132922-fd00a4e0eefc h1:O9NuF4s+E/PvMIy+9IUZB9znFwUIXEWSstNjek6VpVg=
golang.org/x/exp v0.0.0-20240531132922-fd00a4e0eefc/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA=
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/tools v0.21.0 h1:qc0xYgIbsSDt9EyWz05J5wfa7LOVW0YTLOXrqdLAWIw=
golang.org/x/tools v0.21.0/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
modernc.org/cc/v4 v4.21.2 h1:dycHFB/jDc3IyacKipCNSDrjIC0Lm1hyoWOZTRR20Lk=
modernc.org/cc/v4 v4.21.2/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ=
modernc.org/ccgo/v4 v4.17.10 h1:6wrtRozgrhCxieCeJh85QsxkX/2FFrT9hdaWPlbn4Zo=
modernc.org/ccgo/v4 v4.17.10/go.mod h1:0NBHgsqTTpm9cA5z2ccErvGZmtntSM9qD2kFAs6pjXM=
modernc.org/cc/v4 v4.21.4 h1:3Be/Rdo1fpr8GrQ7IVw9OHtplU4gWbb+wNgeoBMmGLQ=
modernc.org/cc/v4 v4.21.4/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ=
modernc.org/ccgo/v4 v4.19.2 h1:lwQZgvboKD0jBwdaeVCTouxhxAyN6iawF3STraAal8Y=
modernc.org/ccgo/v4 v4.19.2/go.mod h1:ysS3mxiMV38XGRTTcgo0DQTeTmAO4oCmJl1nX9VFI3s=
modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE=
modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ=
modernc.org/gc/v2 v2.4.1 h1:9cNzOqPyMJBvrUipmynX0ZohMhcxPtMccYgGOJdOiBw=
modernc.org/gc/v2 v2.4.1/go.mod h1:wzN5dK1AzVGoH6XOzc3YZ+ey/jPgYHLuVckd62P0GYU=
modernc.org/gc/v3 v3.0.0-20240304020402-f0dba7c97c2b h1:BnN1t+pb1cy61zbvSUV7SeI0PwosMhlAEi/vBY4qxp8=
modernc.org/gc/v3 v3.0.0-20240304020402-f0dba7c97c2b/go.mod h1:Qz0X07sNOR1jWYCrJMEnbW/X55x206Q7Vt4mz6/wHp4=
modernc.org/libc v1.52.1 h1:uau0VoiT5hnR+SpoWekCKbLqm7v6dhRL3hI+NQhgN3M=
modernc.org/libc v1.52.1/go.mod h1:HR4nVzFDSDizP620zcMCgjb1/8xk2lg5p/8yjfGv1IQ=
modernc.org/libc v1.55.3 h1:AzcW1mhlPNrRtjS5sS+eW2ISCgSOLLNyFzRh/V3Qj/U=
modernc.org/libc v1.55.3/go.mod h1:qFXepLhz+JjFThQ4kzwzOjA/y/artDeg+pcYnY+Q83w=
modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4=
modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo=
modernc.org/memory v1.8.0 h1:IqGTL6eFMaDZZhEWwcREgeMXYwmW83LYW8cROZYkg+E=
@@ -61,8 +61,8 @@ modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4=
modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0=
modernc.org/sortutil v1.2.0 h1:jQiD3PfS2REGJNzNCMMaLSp/wdMNieTbKX920Cqdgqc=
modernc.org/sortutil v1.2.0/go.mod h1:TKU2s7kJMf1AE84OoiGppNHJwvB753OYfNl2WRb++Ss=
modernc.org/sqlite v1.30.1 h1:YFhPVfu2iIgUf9kuA1CR7iiHdcEEsI2i+yjRYHscyxk=
modernc.org/sqlite v1.30.1/go.mod h1:DUmsiWQDaAvU4abhc/N+djlom/L2o8f7gZ95RCvyoLU=
modernc.org/sqlite v1.31.1 h1:XVU0VyzxrYHlBhIs1DiEgSl0ZtdnPtbLVy8hSkzxGrs=
modernc.org/sqlite v1.31.1/go.mod h1:UqoylwmTb9F+IqXERT8bW9zzOWN8qwAIcLdzeBZs4hA=
modernc.org/strutil v1.2.0 h1:agBi9dp1I+eOnxXeiZawM8F4LawKv4NzGWSaLfyeNZA=
modernc.org/strutil v1.2.0/go.mod h1:/mdcBmfOibveCTBxUl5B5l6W+TTH1FXPLHZE6bTosX0=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
64 changes: 28 additions & 36 deletions lib/tgspam/detector.go
Original file line number Diff line number Diff line change
@@ -583,50 +583,42 @@ func (d *Detector) isMultiLang(msg string) spamcheck.Response {
isMultiLingual := func(word string) bool {
scripts := make(map[string]bool)
for _, r := range word {
switch {
case r == 'i': // skip 'i' because it's used in many languages
if r == 'i' || unicode.IsSpace(r) { // skip 'i' (common in many langs) and spaces
continue
case unicode.Is(unicode.Latin, r) || unicode.In(r, unicode.Number):
scripts["Latin"] = true
case unicode.Is(unicode.Cyrillic, r):
scripts["Cyrillic"] = true
case unicode.Is(unicode.Greek, r):
scripts["Greek"] = true
case unicode.Is(unicode.Han, r):
scripts["Han"] = true
case unicode.Is(unicode.Arabic, r):
scripts["Arabic"] = true
case unicode.Is(unicode.Hebrew, r):
scripts["Hebrew"] = true
case unicode.Is(unicode.Devanagari, r):
scripts["Devanagari"] = true
case unicode.Is(unicode.Thai, r):
scripts["Thai"] = true
case unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r):
scripts["Japanese"] = true
case unicode.Is(unicode.Hangul, r):
scripts["Korean"] = true
case unicode.Is(unicode.Bengali, r):
scripts["Bengali"] = true
case unicode.Is(unicode.Armenian, r):
scripts["Armenian"] = true
case unicode.Is(unicode.Georgian, r):
scripts["Georgian"] = true
case r == 'ї':
scripts["Ukrainian"] = true
case unicode.In(r, unicode.Coptic):
scripts["Coptic"] = true
default:
}

scriptFound := false
for name, table := range unicode.Scripts {
if unicode.Is(table, r) {
if name != "Common" && name != "Inherited" {
scripts[name] = true
if len(scripts) > 1 {
return true
}
scriptFound = true
}
break
}
}

// if no specific script was found, it might be a symbol or punctuation
if !scriptFound {
// check for mathematical alphanumeric symbols and letterlike symbols
if unicode.In(r, unicode.Other_Math, unicode.Other_Alphabetic) ||
(r >= '\U0001D400' && r <= '\U0001D7FF') || // Mathematical Alphanumeric Symbols
(r >= '\u2100' && r <= '\u214F') { // Letterlike Symbols
scripts["Mathematical"] = true
if len(scripts) > 1 {
return true
}
} else if !unicode.IsPunct(r) && !unicode.IsSymbol(r) {
// if it's not punctuation or a symbol, count it as "Other"
scripts["Other"] = true
if len(scripts) > 1 {
return true
}
}
}
if len(scripts) > 1 {
return true
}
}
return false
}
5 changes: 3 additions & 2 deletions lib/tgspam/detector_test.go
Original file line number Diff line number Diff line change
@@ -622,7 +622,7 @@ func TestDetector_CheckMultiLang(t *testing.T) {
count int
spam bool
}{
{"No MultiLang", "Hello, world!", 0, false},
{"No MultiLang", "Hello, world!\n 12345-980! _", 0, false},
{"One MultiLang", "Hi therе", 1, false},
{"Two MultiLang", "Gооd moфning", 2, true},
{"WithCyrillic no MultiLang", "Привет мир", 0, false},
@@ -633,7 +633,8 @@ func TestDetector_CheckMultiLang(t *testing.T) {
{"WithCyrillic real example 3", "Всем привет, есть простая шабашка, подойдет любому. Даю 15 тысяч. Накину на проезд, сигареты, обед. ", 0, false},
{"WithCyrillic and i", "Привет мiр", 0, false},
{"strange with cyrillic", "𝐇айди и𝐇𝐓и𝐦𝐇ы𝐞 ф𝐨𝐓𝐤и лю𝐛𝐨й д𝐞𝐁𝐲ш𝐤и ч𝐞𝐩𝐞𝟑 𝐛𝐨𝐓а", 7, true},
{"coptic capital leter", "✔️ⲠⲢⲞⳜⲈЙ-ⲖЮⳜⲨЮ-ⲆⲈⲂⲨⲰⲔⲨ...\n\nⲎⲀЙⲆⳘ ⲤⲔⲢЫⲦⲈ ⲂⳘⲆⲞⲤЫ-ⲪⲞⲦⲞⳠⲔⳘ ⳘⲎⲦⳘⲘⲎⲞⲄⲞ-ⲬⲀⲢⲀⲔⲦⲈⲢⲀ..\n@INTIM0CHKI110DE\n\n", 5, true},
{"coptic capital leter", "✔️ⲠⲢⲞⳜⲈЙ-ⲖЮⳜⲨЮ-ⲆⲈⲂⲨⲰⲔⲨ...\n\nⲎⲀЙⲆⳘ ⲤⲔⲢЫⲦⲈ ⲂⳘⲆⲞⲤЫ-ⲪⲞⲦⲞⳠⲔⳘ ⳘⲎⲦⳘⲘⲎⲞⲄⲞ-ⲬⲀⲢⲀⲔⲦⲈⲢⲀ..\n@INTIM0CHKI110DE\n\n", 6, true},
{"mix with gothic, cyrillic and greek", "𐌿РОВЕРЬ ЛЮБУЮ НА НАЛИЧИЕ ПОШЛЫХ ΦΟͲΟ-ΒͶДξΟ, 🍑НАБЕРИ В Т𐌲 𐌿ОИСКЕ СЛОВО: 30GRL", 5, true},
}

for _, tt := range tests {