Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix line diff by using rune index without a separator #136

Merged
merged 2 commits into from
Aug 2, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Fix line diff by using runes without separators
[The suggested approach](https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs#line-mode
) for doing line level diffing is the following set of steps:

1. `ti1, ti2, linesIdx = DiffLinesToChars(t1, t2)`
2. `diffs = DiffMain(ti1, ti2)`
3. `DiffCharsToLines(diff, linesIdx)`

The original implementation in `google/diff-match-patch` uses
unicode codepoints for storing indices in `ti1` and `ti2` joined by an empty string.
Current implementation in this repo stores them as integers joined by a
comma. While this implementation makes `ti1` and `ti2` more readable, it
introduces bugs when trying to rely on it when doing line level diffing
with `DiffMain`. The root cause of the issue is that an integer line
index might span more than one character/rune, and `DiffMain` can assume
that two different lines having the same index prefix match partially. For
example, indices 123 and 129 will have partial match `12`. In that
example, the diff will show lines 3 and 9 which is not correct. A simple
failing test case demonstrating this issue is available at
`TestDiffPartialLineIndex`.

In this PR I am adjusting the algorithm to use the same approach as in
[diff-match-patch](https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L508-L510
) by storing each line index as a rune.
While a rune in Golang is a type alias to uint32, not every uint32
can be a valid rune. During string to rune slice conversion invalid runes will
be replaced with `utf.RuneError`.

The integer to rune generation logic is based on the table in https://en.wikipedia.org/wiki/UTF-8#Encoding

The first 127 lines will work the fastest as they are represented as a
single bytes. Higher numbers are represented as 2-4 bytes.

In addition to that, the range `U+D800 - U+DFFF` contains
[invalid codepoints](https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling).
and all codepoints higher or equal to `0xD800` are incremented by
`0xDFFF - 0xD800`.

The maximum representable integer using this approach is 1'112'060.
This improves on Javascript implementation which currently
[bails out](https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L503-L505
) when files have more than 65535 lines.
kdarkhan committed Jan 23, 2023
commit a674b3095807a3679f154c5a4a6df738aeffdfdf
13 changes: 4 additions & 9 deletions diffmatchpatch/diff.go
Original file line number Diff line number Diff line change
@@ -34,8 +34,6 @@ const (
DiffInsert Operation = 1
// DiffEqual item represents an equal diff.
DiffEqual Operation = 0
//IndexSeparator is used to seperate the array indexes in an index string
IndexSeparator = ","
)

// Diff represents one diff operation
@@ -406,14 +404,11 @@ func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
hydrated := make([]Diff, 0, len(diffs))
for _, aDiff := range diffs {
chars := strings.Split(aDiff.Text, IndexSeparator)
text := make([]string, len(chars))
runes := []rune(aDiff.Text)
text := make([]string, len(runes))

for i, r := range chars {
i1, err := strconv.Atoi(r)
if err == nil {
text[i] = lineArray[i1]
}
for i, r := range runes {
text[i] = lineArray[runeToInt(r)]
}

aDiff.Text = strings.Join(text, "")
38 changes: 18 additions & 20 deletions diffmatchpatch/diff_test.go
Original file line number Diff line number Diff line change
@@ -314,12 +314,12 @@ func TestDiffLinesToChars(t *testing.T) {
dmp := New()

for i, tc := range []TestCase{
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
{"a", "b", "1", "2", []string{"", "a", "b"}},
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\x01\x02\x03\x03", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
{"a", "b", "\x01", "\x02", []string{"", "a", "b"}},
// Omit final newline.
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
{"alpha\nbeta\nalpha", "", "\x01\x02\x03", "", []string{"", "alpha\n", "beta\n", "alpha"}},
// Same lines in Text1 and Text2
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "\x01\x02\x03", "\x01\x04\x03\x05", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
} {
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
@@ -332,14 +332,13 @@ func TestDiffLinesToChars(t *testing.T) {
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
var charList []string
var charList []rune
for x := 1; x < n+1; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, strconv.Itoa(x))
charList = append(charList, rune(x))
}
lines := strings.Join(lineList, "")
chars := strings.Join(charList[:], ",")
assert.Equal(t, n, len(strings.Split(chars, ",")))
chars := string(charList)

actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
assert.Equal(t, chars, actualChars1)
@@ -360,8 +359,8 @@ func TestDiffCharsToLines(t *testing.T) {
for i, tc := range []TestCase{
{
Diffs: []Diff{
{DiffEqual, "1,2,1"},
{DiffInsert, "2,1,2"},
{DiffEqual, "\x01\x02\x01"},
{DiffInsert, "\x02\x01\x02"},
},
Lines: []string{"", "alpha\n", "beta\n"},

@@ -380,13 +379,12 @@ func TestDiffCharsToLines(t *testing.T) {
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
charList := []string{}
charList := []rune{}
for x := 1; x <= n; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, strconv.Itoa(x))
charList = append(charList, rune(x))
}
assert.Equal(t, n, len(charList))
chars := strings.Join(charList[:], ",")
chars := string(charList)

actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
@@ -1471,7 +1469,7 @@ func TestMassiveRuneDiffConversion(t *testing.T) {
func TestDiffPartialLineIndex(t *testing.T) {
dmp := New()
t1, t2, tt := dmp.DiffLinesToChars(
`line 1
`line 1
line 2
line 3
line 4
@@ -1481,7 +1479,7 @@ line 7
line 8
line 9
line 10 text1`,
`line 1
`line 1
line 2
line 3
line 4
@@ -1494,10 +1492,10 @@ line 10 text2`)
diffs := dmp.DiffMain(t1, t2, false)
diffs = dmp.DiffCharsToLines(diffs, tt)
assert.Equal(t, []Diff{
Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"},
Diff{DiffDelete, "line 10 text1"},
Diff{DiffInsert, "line 10 text2"},
}, diffs)
Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"},
Diff{DiffDelete, "line 10 text1"},
Diff{DiffInsert, "line 10 text2"},
}, diffs)
}

func BenchmarkDiffMain(bench *testing.B) {
100 changes: 92 additions & 8 deletions diffmatchpatch/stringutil.go
Original file line number Diff line number Diff line change
@@ -9,11 +9,16 @@
package diffmatchpatch

import (
"strconv"
"fmt"
"strings"
"unicode/utf8"
)

const UNICODE_INVALID_RANGE_START = 0xD800
const UNICODE_INVALID_RANGE_END = 0xDFFF
const UNICODE_INVALID_RANGE_DELTA = UNICODE_INVALID_RANGE_END - UNICODE_INVALID_RANGE_START + 1
const UNICODE_RANGE_MAX = 0x10FFFF

// unescaper unescapes selected chars for compatibility with JavaScript's encodeURI.
// In speed critical applications this could be dropped since the receiving application will certainly decode these fine. Note that this function is case-sensitive. Thus "%3F" would not be unescaped. But this is ok because it is only called with the output of HttpUtility.UrlEncode which returns lowercase hex. Example: "%3f" -> "?", "%24" -> "$", etc.
var unescaper = strings.NewReplacer(
@@ -93,14 +98,93 @@ func intArrayToString(ns []uint32) string {
return ""
}

indexSeparator := IndexSeparator[0]

// Appr. 3 chars per num plus the comma.
b := []byte{}
b := []rune{}
for _, n := range ns {
b = strconv.AppendInt(b, int64(n), 10)
b = append(b, indexSeparator)
b = append(b, intToRune(n))
}
b = b[:len(b)-1]
return string(b)
}

// These constants define the number of bits representable
// in 1,2,3,4 byte utf8 sequences, respectively.
const ONE_BYTE_BITS = 7
const TWO_BYTE_BITS = 11
const THREE_BYTE_BITS = 16
const FOUR_BYTE_BITS = 21

// Helper for getting a sequence of bits from an integer.
func getBits(i uint32, cnt byte, from byte) byte {
return byte((i >> from) & ((1 << cnt) - 1))
}

// Converts an integer in the range 0~1112060 into a rune.
// Based on the ranges table in https://en.wikipedia.org/wiki/UTF-8
func intToRune(i uint32) rune {
if i < (1 << ONE_BYTE_BITS) {
return rune(i)
}

if i < (1 << TWO_BYTE_BITS) {
r, size := utf8.DecodeRune([]byte{0b11000000 | getBits(i, 5, 6), 0b10000000 | getBits(i, 6, 0)})
if size != 2 || r == utf8.RuneError {
panic(fmt.Sprintf("Error encoding an int %d with size 2, got rune %v and size %d", size, r, i))
}
return r
}

// Last -3 here needed because for some reason 3rd to last codepoint 65533 in this range
// was returning utf8.RuneError during encoding.
if i < ((1 << THREE_BYTE_BITS) - UNICODE_INVALID_RANGE_DELTA - 3) {
if i >= UNICODE_INVALID_RANGE_START {
i += UNICODE_INVALID_RANGE_DELTA
}

r, size := utf8.DecodeRune([]byte{0b11100000 | getBits(i, 4, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)})
if size != 3 || r == utf8.RuneError {
panic(fmt.Sprintf("Error encoding an int %d with size 3, got rune %v and size %d", size, r, i))
}
return r
}

if i < (1<<FOUR_BYTE_BITS - UNICODE_INVALID_RANGE_DELTA - 3) {
i += UNICODE_INVALID_RANGE_DELTA + 3
r, size := utf8.DecodeRune([]byte{0b11110000 | getBits(i, 3, 18), 0b10000000 | getBits(i, 6, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)})
if size != 4 || r == utf8.RuneError {
panic(fmt.Sprintf("Error encoding an int %d with size 4, got rune %v and size %d", size, r, i))
}
return r
}
panic(fmt.Sprintf("The integer %d is too large for runeToInt()", i))
}

// Converts a rune generated by intToRune back to an integer
func runeToInt(r rune) uint32 {
i := uint32(r)
if i < (1 << ONE_BYTE_BITS) {
return i
}

bytes := []byte{0, 0, 0, 0}

size := utf8.EncodeRune(bytes, r)

if size == 2 {
return uint32(bytes[0]&0b11111)<<6 | uint32(bytes[1]&0b111111)
}

if size == 3 {
result := uint32(bytes[0]&0b1111)<<12 | uint32(bytes[1]&0b111111)<<6 | uint32(bytes[2]&0b111111)
if result >= UNICODE_INVALID_RANGE_END {
return result - UNICODE_INVALID_RANGE_DELTA
}

return result
}

if size == 4 {
result := uint32(bytes[0]&0b111)<<18 | uint32(bytes[1]&0b111111)<<12 | uint32(bytes[2]&0b111111)<<6 | uint32(bytes[3]&0b111111)
return result - UNICODE_INVALID_RANGE_DELTA - 3
}

panic(fmt.Sprintf("Unexpected state decoding rune=%v size=%d", r, size))
}
18 changes: 18 additions & 0 deletions diffmatchpatch/stringutil_test.go
Original file line number Diff line number Diff line change
@@ -114,3 +114,21 @@ func TestLastIndexOf(t *testing.T) {
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc))
}
}

// Exhaustive check for all ints from 0 to 1112060 for correctness of implementation
// of `intToRune() -> runeToInt()`.
// This test is slow and runs longer than 5 seconds but it does provide a safety
// guarantee that these 2 functions are correct for the ranges we support.
func TestRuneToInt(t *testing.T) {

for i := uint32(0); i <= UNICODE_RANGE_MAX-UNICODE_INVALID_RANGE_DELTA-3; i += 1 {
r := intToRune(i)
ic := runeToInt(r)

assert.Equal(t, i, ic, fmt.Sprintf("intToRune(%d)=%d and runeToInt(%d)=%d", i, r, r, ic))
}

assert.Panics(t, func() {
intToRune(UNICODE_RANGE_MAX - UNICODE_INVALID_RANGE_DELTA - 2)
})
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -8,4 +8,4 @@ require (
gopkg.in/yaml.v2 v2.4.0 // indirect
)

go 1.12
go 1.13
Copy link
Contributor Author

@kdarkhan kdarkhan Jan 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Upgrading to go 1.13 in order to use binary and hex integer literals.