From 7b93d73639d95da588ce0fd19c104110a94f35bb Mon Sep 17 00:00:00 2001 From: Jimmy Praet Date: Sat, 3 Jul 2021 12:51:27 +0200 Subject: [PATCH 1/2] Detect encoding changes while parsing diff --- services/gitdiff/gitdiff.go | 48 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index f8f0fd7e3b90b..c96f77ccd1215 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -32,6 +32,7 @@ import ( "github.com/sergi/go-diff/diffmatchpatch" stdcharset "golang.org/x/net/html/charset" + "golang.org/x/text/encoding" "golang.org/x/text/transform" ) @@ -886,32 +887,43 @@ parsingLoop: // FIXME: There are numerous issues with this: // - we might want to consider detecting encoding while parsing but... // - we're likely to fail to get the correct encoding here anyway as we won't have enough information - // - and this doesn't really account for changes in encoding - var buf bytes.Buffer + var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3) + var diffLineTypeDecoders = make(map[DiffLineType]*encoding.Decoder, 3) + diffLineTypeBuffers[DiffLinePlain] = new(bytes.Buffer) + diffLineTypeBuffers[DiffLineAdd] = new(bytes.Buffer) + diffLineTypeBuffers[DiffLineDel] = new(bytes.Buffer) for _, f := range diff.Files { - buf.Reset() + for _, buffer := range diffLineTypeBuffers { + buffer.Reset() + } for _, sec := range f.Sections { for _, l := range sec.Lines { if l.Type == DiffLineSection { continue } - buf.WriteString(l.Content[1:]) - buf.WriteString("\n") + diffLineTypeBuffers[l.Type].WriteString(l.Content[1:]) + diffLineTypeBuffers[l.Type].WriteString("\n") } } - charsetLabel, err := charset.DetectEncoding(buf.Bytes()) - if charsetLabel != "UTF-8" && err == nil { - encoding, _ := stdcharset.Lookup(charsetLabel) - if encoding != nil { - d := encoding.NewDecoder() - for _, sec := range f.Sections { - for _, l := range sec.Lines { - if l.Type == DiffLineSection { - continue - } - if c, _, err := transform.String(d, l.Content[1:]); err == nil { - l.Content = l.Content[0:1] + c - } + for lineType, buffer := range diffLineTypeBuffers { + diffLineTypeDecoders[lineType] = nil + if buffer.Len() == 0 { + continue + } + charsetLabel, err := charset.DetectEncoding(buffer.Bytes()) + if charsetLabel != "UTF-8" && err == nil { + encoding, _ := stdcharset.Lookup(charsetLabel) + if encoding != nil { + diffLineTypeDecoders[lineType] = encoding.NewDecoder() + } + } + } + for _, sec := range f.Sections { + for _, l := range sec.Lines { + decoder := diffLineTypeDecoders[l.Type] + if decoder != nil { + if c, _, err := transform.String(decoder, l.Content[1:]); err == nil { + l.Content = l.Content[0:1] + c } } } From e7c7be219986f3963709df60668cacd38fa2cc2a Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Tue, 13 Jul 2021 01:04:03 +0200 Subject: [PATCH 2/2] Update services/gitdiff/gitdiff.go --- services/gitdiff/gitdiff.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index c96f77ccd1215..d50e41eb40279 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -884,7 +884,7 @@ parsingLoop: } - // FIXME: There are numerous issues with this: + // TODO: There are numerous issues with this: // - we might want to consider detecting encoding while parsing but... // - we're likely to fail to get the correct encoding here anyway as we won't have enough information var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3)