Skip to content

Commit

Permalink
Index out of range panic in DiffCharsToLines on large JSON diff
Browse files Browse the repository at this point in the history
  • Loading branch information
r-pai authored and sergi committed Dec 1, 2020
1 parent df97e07 commit db1b095
Show file tree
Hide file tree
Showing 3 changed files with 100,121 additions and 63 deletions.
124 changes: 73 additions & 51 deletions diffmatchpatch/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ const (
DiffInsert Operation = 1
// DiffEqual item represents an equal diff.
DiffEqual Operation = 0
// IndexSeperator is used to separate the array indexes in an index string.
IndexSeperator = ","
)

// Diff represents one diff operation
Expand Down Expand Up @@ -396,65 +398,17 @@ func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string

// DiffLinesToRunes splits two texts into a list of runes. Each rune represents one line.
// It returns the reduced form of text1, the reduced form of text2, and the
// lineArray slice used to map the reduced form back to line text.
//
// NOTE(review): two bodies appear back to back below, which looks like a diff
// rendering that kept both the removed and the added lines; as listed, the
// second ":=" redeclares chars1, chars2 and lineArray. Confirm which body is
// current before relying on this listing.
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
// First body: build the line table here and reduce each text with
// diffLinesToRunesMunge, sharing one lineArray and one lineHash between both texts.
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4

chars1 := dmp.diffLinesToRunesMunge(text1, &lineArray, lineHash)
chars2 := dmp.diffLinesToRunesMunge(text2, &lineArray, lineHash)

return chars1, chars2, lineArray
// Second body: delegate to DiffLinesToStrings and convert both returned
// strings to []rune. Those strings are decimal indexes joined with
// IndexSeperator, so this conversion yields one rune per character of that
// string (digits and separators) rather than one rune per line.
chars1, chars2, lineArray := dmp.DiffLinesToStrings(text1, text2)
return []rune(chars1), []rune(chars2), lineArray
}

// diffLinesToRunes is the []rune-input form of DiffLinesToRunes: both inputs
// are converted to strings and passed through, and the three results are
// returned unchanged.
func (dmp *DiffMatchPatch) diffLinesToRunes(text1, text2 []rune) ([]rune, []rune, []string) {
	first, second := string(text1), string(text2)
	return dmp.DiffLinesToRunes(first, second)
}

// diffLinesToRunesMunge reduces text to one rune per line. Every line (with
// its trailing "\n" when it has one) is looked up in lineHash; a line seen for
// the first time is appended to *lineArray and recorded in lineHash under its
// new position. The rune emitted for a line is that position converted with
// rune(...).
// The input is a string rather than a []rune mainly because a []rune cannot be
// used as a map key.
func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineArray *[]string, lineHash map[string]int) []rune {
	encoded := []rune{}

	// Lines are sliced straight out of text instead of splitting it up front:
	// a split would temporarily double the memory footprint, and modifying
	// text would create many large strings to garbage collect.
	for start, end := 0, -1; end < len(text)-1; start = end + 1 {
		end = indexOf(text, "\n", start)
		if end == -1 {
			// No further newline: the remainder of text is the last line.
			end = len(text) - 1
		}
		line := text[start : end+1]

		position, known := lineHash[line]
		if !known {
			*lineArray = append(*lineArray, line)
			position = len(*lineArray) - 1
			lineHash[line] = position
		}
		encoded = append(encoded, rune(position))
	}

	return encoded
}

// DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
// It returns a new slice of diffs; each returned Diff is a copy of the input
// element with its Text replaced.
//
// NOTE(review): two bodies appear back to back below, which looks like a diff
// rendering that kept both the removed and the added lines; as listed,
// hydrated is declared twice with ":=". Confirm which body is current.
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
// First body: treat every rune of Text as an index into lineArray.
hydrated := make([]Diff, 0, len(diffs))
for _, aDiff := range diffs {
chars := aDiff.Text
// len(chars) is the byte length of the string, and i below is the byte
// offset of each rune, so unused slots stay "" and vanish in the Join.
text := make([]string, len(chars))

for i, r := range chars {
// lineArray is indexed by the rune's numeric value; no bounds check is
// made here.
text[i] = lineArray[r]
}

aDiff.Text = strings.Join(text, "")
hydrated = append(hydrated, aDiff)
}
// Second body: delegate the whole conversion to DiffStringsToLines.
hydrated := dmp.DiffStringsToLines(diffs, lineArray)
return hydrated
}

Expand Down Expand Up @@ -1343,3 +1297,71 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di

return diffs, nil
}

// DiffLinesToStrings reduces two texts to index strings. Each text becomes a
// list of decimal indexes into the returned line array, one index per line,
// joined with IndexSeperator. It returns the index string for text1, the index
// string for text2, and the line array.
func (dmp *DiffMatchPatch) DiffLinesToStrings(text1, text2 string) (string, string, []string) {
	// Slot 0 is taken by a junk entry, so every real line is numbered from 1.
	lineArray := []string{""} // e.g. lineArray[4] == "Hello\n"

	// Both texts are reduced against the same line array, text1 first.
	indexes1 := dmp.diffLinesToStringsMunge(text1, &lineArray)
	indexes2 := dmp.diffLinesToStringsMunge(text2, &lineArray)

	// The delimiter lets the index strings be split back into their parts later.
	joined1 := strings.Join(indexes1, IndexSeperator)
	joined2 := strings.Join(indexes2, IndexSeperator)

	return joined1, joined2, lineArray
}

// diffLinesToStringsMunge splits text into lines and reduces it to a slice of
// decimal strings, one per line, where each string is the index of that line
// in *lineArray. A line (including its trailing "\n" when it has one) that is
// not yet in *lineArray is appended to it.
//
// The lookup table is seeded from the current contents of *lineArray, so a
// line stored by an earlier call on the same array is given the same index
// again. Starting every call with an empty table would hand out a fresh index
// for a line that both texts of a pair share, and the two texts would then
// never be recognised as having that line in common.
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []string {
	lineHash := make(map[string]int, len(*lineArray)) // e.g. lineHash["Hello\n"] == 4
	for i, known := range *lineArray {
		// Keep the first position of an entry so repeated entries stay stable.
		if _, seen := lineHash[known]; !seen {
			lineHash[known] = i
		}
	}

	// Walk the text, pulling out a substring for each line. Splitting on "\n"
	// would temporarily double our memory footprint. Modifying text would
	// create many large strings to garbage collect.
	lineStart := 0
	lineEnd := -1
	indexes := []string{}

	for lineEnd < len(text)-1 {
		lineEnd = indexOf(text, "\n", lineStart)

		if lineEnd == -1 {
			// No further newline: the remainder of text is the last line.
			lineEnd = len(text) - 1
		}

		line := text[lineStart : lineEnd+1]
		lineStart = lineEnd + 1

		lineValue, ok := lineHash[line]
		if !ok {
			*lineArray = append(*lineArray, line)
			lineValue = len(*lineArray) - 1
			lineHash[line] = lineValue
		}
		indexes = append(indexes, strconv.Itoa(lineValue))
	}

	return indexes
}

// DiffStringsToLines rehydrates the text in a diff from a string of line
// indexes to real lines of text. The Text of every diff is split on
// IndexSeperator, each part is parsed as a decimal index into lineArray, and
// the referenced lines are concatenated in order. It returns a new slice; each
// element is a copy of the input Diff with its Text replaced.
//
// A part that is not a valid integer, or whose value lies outside lineArray,
// contributes nothing to the output instead of causing an index-out-of-range
// panic.
func (dmp *DiffMatchPatch) DiffStringsToLines(diffs []Diff, lineArray []string) []Diff {
	hydrated := make([]Diff, 0, len(diffs))
	for _, aDiff := range diffs {
		parts := strings.Split(aDiff.Text, IndexSeperator)
		text := make([]string, len(parts))

		for i, part := range parts {
			index, err := strconv.Atoi(part)
			if err != nil || index < 0 || index >= len(lineArray) {
				// Leave the slot empty; an empty Text splits into one
				// empty part and also ends up here.
				continue
			}
			text[i] = lineArray[index]
		}

		aDiff.Text = strings.Join(text, "")
		hydrated = append(hydrated, aDiff)
	}
	return hydrated
}
59 changes: 47 additions & 12 deletions diffmatchpatch/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ package diffmatchpatch
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"strconv"
"strings"
"testing"
Expand Down Expand Up @@ -312,10 +314,10 @@ func TestDiffLinesToChars(t *testing.T) {
dmp := New()

for i, tc := range []TestCase{
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
{"a", "b", "\u0001", "\u0002", []string{"", "a", "b"}},
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
{"a", "b", "1", "2", []string{"", "a", "b"}},
// Omit final newline.
{"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", []string{"", "alpha\n", "beta\n", "alpha"}},
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
} {
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
Expand All @@ -328,14 +330,14 @@ func TestDiffLinesToChars(t *testing.T) {
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
var charList []rune
var charList []string
for x := 1; x < n+1; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, rune(x))
charList = append(charList, strconv.Itoa(x))
}
lines := strings.Join(lineList, "")
chars := string(charList)
assert.Equal(t, n, utf8.RuneCountInString(chars))
chars := strings.Join(charList[:], ",")
assert.Equal(t, n, len(strings.Split(chars, ",")))

actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
assert.Equal(t, chars, actualChars1)
Expand All @@ -356,8 +358,8 @@ func TestDiffCharsToLines(t *testing.T) {
for i, tc := range []TestCase{
{
Diffs: []Diff{
{DiffEqual, "\u0001\u0002\u0001"},
{DiffInsert, "\u0002\u0001\u0002"},
{DiffEqual, "1,2,1"},
{DiffInsert, "2,1,2"},
},
Lines: []string{"", "alpha\n", "beta\n"},

Expand All @@ -376,14 +378,15 @@ func TestDiffCharsToLines(t *testing.T) {
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
charList := []rune{}
charList := []string{}
for x := 1; x <= n; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, rune(x))
charList = append(charList, strconv.Itoa(x))
}
assert.Equal(t, n, len(charList))
chars := strings.Join(charList[:], ",")

actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineList)
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
}

Expand Down Expand Up @@ -1496,3 +1499,35 @@ func BenchmarkDiffMainRunesLargeLines(b *testing.B) {
diffs = dmp.DiffCharsToLines(diffs, linearray)
}
}

// BenchmarkDiffMainStringsLargeLines times the string-index line pipeline on
// the two texts returned by speedtestTexts: reduce both texts with
// DiffLinesToStrings, diff the reduced strings with DiffMain, and expand the
// result with DiffStringsToLines. Setup runs before the timer is reset.
func BenchmarkDiffMainStringsLargeLines(b *testing.B) {
	s1, s2 := speedtestTexts()

	dmp := New()

	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		reduced1, reduced2, lines := dmp.DiffLinesToStrings(s1, s2)
		_ = dmp.DiffStringsToLines(dmp.DiffMain(reduced1, reduced2, false), lines)
	}
}

// BenchmarkDiffMainRunesLargeDiffLines times the rune line pipeline
// (DiffLinesToRunes, DiffMainRunes, DiffCharsToLines) on the contents of
// ../testdata/diff10klinestest.txt diffed against an empty text.
//
// The benchmark fails if the fixture cannot be opened or read, so a missing
// file is reported instead of silently timing an empty input.
func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) {
	fp, err := os.Open("../testdata/diff10klinestest.txt")
	if err != nil {
		b.Fatal(err)
	}
	defer fp.Close()

	data, err := ioutil.ReadAll(fp)
	if err != nil {
		b.Fatal(err)
	}
	// Convert once, outside the timed loop, so the copy is not measured.
	text := string(data)

	dmp := New()

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		text1, text2, linearray := dmp.DiffLinesToRunes(text, "")

		diffs := dmp.DiffMainRunes(text1, text2, false)
		diffs = dmp.DiffCharsToLines(diffs, linearray)
	}
}
Loading

0 comments on commit db1b095

Please sign in to comment.