Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix chardet test and add ordering option #11621

Merged
merged 11 commits into from
Jun 2, 2020
7 changes: 6 additions & 1 deletion custom/conf/app.ini.sample
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ RUN_MODE = dev
[repository]
ROOT =
SCRIPT_TYPE = bash
; Default ANSI charset
; DETECTED_CHARSETS_ORDER tie-break order for detected charsets.
; If the charsets have equal confidence, tie-breaking will be done by order in this list
; with charsets earlier in the list chosen in preference to those later.
; Adding "defaults" will place the unused charsets at that position.
DETECTED_CHARSETS_ORDER=UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859, windows-1252, ISO-8859, windows-1250, ISO-8859, ISO-8859, ISO-8859, windows-1253, ISO-8859, windows-1255, ISO-8859, windows-1251, windows-1256, KOI8-R, ISO-8859, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022, ISO-2022, ISO-2022, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr
; Default ANSI charset to override non-UTF-8 charsets to
ANSI_CHARSET =
; Force every new repository to be private
FORCE_PRIVATE = false
Expand Down
3 changes: 2 additions & 1 deletion docs/content/doc/advanced/config-cheat-sheet.en-us.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`.
an absolute path.
- `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`,
but some users report that only `sh` is available.
- `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset.
- `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859, windows-1252, ISO-8859, windows-1250, ISO-8859, ISO-8859, ISO-8859, windows-1253, ISO-8859, windows-1255, ISO-8859, windows-1251, windows-1256, KOI8-R, ISO-8859, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022, ISO-2022, ISO-2022, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets - if the detected charsets have equal confidence, charsets earlier in the list will be chosen in preference to those later. Adding `defaults` will place the unnamed charsets at that point.
- `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset to override non-UTF-8 charsets to.
- `FORCE_PRIVATE`: **false**: Force every new repository to be private.
- `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository.
\[last, private, public\]
Expand Down
35 changes: 31 additions & 4 deletions modules/charset/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package charset
import (
"bytes"
"fmt"
"strings"
"unicode/utf8"

"code.gitea.io/gitea/modules/log"
Expand Down Expand Up @@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) {
} else {
detectContent = content
}
result, err := textDetector.DetectBest(detectContent)

// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
results, err := textDetector.DetectAll(detectContent)
if err != nil {
if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
return setting.Repository.AnsiCharset, nil
}
return "", err
}

topConfidence := results[0].Confidence
topResult := results[0]
priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
for _, result := range results {
// As results are sorted in confidence order - if we have a different confidence
// we know it's less than the current confidence and can break out of the loop early
if result.Confidence != topConfidence {
break
}

// Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guesss
resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
if resultHas && (!has || resultPriority < priority) {
topResult = result
priority = resultPriority
has = true
}
}

// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
return setting.Repository.AnsiCharset, err
}

log.Debug("Detected encoding: %s", result.Charset)
return result.Charset, err
log.Debug("Detected encoding: %s", topResult.Charset)
return topResult.Charset, err
}
4 changes: 4 additions & 0 deletions modules/charset/charset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) {
// we accept either.
assert.Contains(t, encoding, "ISO-8859")

old := setting.Repository.AnsiCharset
setting.Repository.AnsiCharset = "placeholder"
defer func() {
setting.Repository.AnsiCharset = old
}()
testSuccess(b, "placeholder")

// invalid bytes
Expand Down
74 changes: 74 additions & 0 deletions modules/setting/repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ const (
// Repository settings
var (
Repository = struct {
DetectedCharsetsOrder []string
DetectedCharsetScore map[string]int `ini:"-"`
AnsiCharset string
ForcePrivate bool
DefaultPrivate string
Expand Down Expand Up @@ -88,6 +90,42 @@ var (
Wiki []string
} `ini:"repository.signing"`
}{
DetectedCharsetsOrder: []string{
"UTF-8",
"UTF-16BE",
"UTF-16LE",
"UTF-32BE",
"UTF-32LE",
"ISO-8859-1",
"windows-1252",
"ISO-8859-2",
"windows-1250",
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"windows-1253",
"ISO-8859-8-I",
"windows-1255",
"ISO-8859-8",
"windows-1251",
"windows-1256",
"KOI8-R",
"ISO-8859-9",
"windows-1254",
"Shift_JIS",
"GB18030",
"EUC-JP",
"EUC-KR",
"Big5",
"ISO-2022-JP",
"ISO-2022-KR",
"ISO-2022-CN",
"IBM424_rtl",
"IBM424_ltr",
"IBM420_rtl",
"IBM420_ltr",
},
DetectedCharsetScore: map[string]int{},
AnsiCharset: "",
ForcePrivate: false,
DefaultPrivate: RepoCreatingLastUserVisibility,
Expand Down Expand Up @@ -208,6 +246,10 @@ func newRepository() {
} else {
RepoRootPath = filepath.Clean(RepoRootPath)
}
defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder))
for _, charset := range Repository.DetectedCharsetsOrder {
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset)))
}
ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash")

if err = Cfg.Section("repository").MapTo(&Repository); err != nil {
Expand All @@ -222,6 +264,38 @@ func newRepository() {
log.Fatal("Failed to map Repository.PullRequest settings: %v", err)
}

preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder))
for _, charset := range Repository.DetectedCharsetsOrder {
canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
preferred = append(preferred, canonicalCharset)
// remove it from the defaults
for i, charset := range defaultDetectedCharsetsOrder {
if charset == canonicalCharset {
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...)
break
}
}
}

i := 0
for _, charset := range preferred {
// Add the defaults
if charset == "defaults" {
for _, charset := range defaultDetectedCharsetsOrder {
canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has {
Repository.DetectedCharsetScore[canonicalCharset] = i
i++
}
}
continue
}
if _, has := Repository.DetectedCharsetScore[charset]; !has {
Repository.DetectedCharsetScore[charset] = i
i++
}
}

if !filepath.IsAbs(Repository.Upload.TempPath) {
Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath)
}
Expand Down