Merge pull request #5 from warpfork/normalizing-linebreaks-pt3

fix parsing CRLF files, part 3
warpfork · Sep 26, 2021 · 91a00c1 · 91a00c1
2 parents 9dc11b3 + 83e3eb5
commit 91a00c1
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 3 deletions.
diff --git a/read.go b/read.go
@@ -29,6 +29,7 @@ func ReadFile(name string) (*Document, error) {
 var (
 	sigilLineBreak      = []byte{'\n'} // a single LF byte
 	sigilCarriageReturn = []byte{'\r'} // a single CR byte
+	sigilCrLf           = []byte{'\r', '\n'} // the two-byte CR LF pair; normalizeEndings rewrites it to sigilLineBreak
 	sigilCodeBlock      = []byte("```") // three backticks
 	sigilTestmark       = []byte("[testmark]:# ") // literal prefix, including the trailing space
 )
@@ -67,7 +68,7 @@ func Parse(data []byte) (*Document, error) {
 			case true: // ending a block
 				if hunkInProgress.LineStart > -1 {
 					hunkInProgress.LineEnd = i
-					hunkInProgress.Body = doc.Original[codeBlockOffset:offset]
+					hunkInProgress.Body = normalizeEndings(doc.Original[codeBlockOffset:offset])
 					doc.DataHunks = append(doc.DataHunks, hunkInProgress)
 					doc.HunksByName[hunkInProgress.Name] = hunkInProgress
 					hunkInProgress = DocHunk{LineStart: -1}
@@ -124,3 +125,18 @@ func Parse(data []byte) (*Document, error) {
 	}
 	return &doc, nil
 }
+
+// normalizeEndings flattens every "\r\n" pair in the input to a single "\n".
+//
+// If the input contains no "\r\n", the original byte slice is returned as-is
+// (same backing array, no allocation).  Otherwise the result is a freshly
+// allocated copy, so it no longer aliases the input.
+// A '\r' that is not immediately followed by '\n' is left untouched.
+//
+// This function does not bring joy; however,
+// see https://github.com/warpfork/go-testmark/pull/4#issuecomment-922760414
+// and see https://github.com/warpfork/go-testmark/pull/4#issuecomment-922782549
+// for discussion.  Performing this kind of normalization to data hunk boundaries
+// seems to be a "least bad" behavior in a practical sense.
+func normalizeEndings(in []byte) []byte {
+	// bytes.ReplaceAll always returns a copy, even when nothing matches, so check first;
+	// Contains stops at the first hit instead of counting every occurrence.
+	if !bytes.Contains(in, sigilCrLf) {
+		return in
+	}
+	return bytes.ReplaceAll(in, sigilCrLf, sigilLineBreak)
+}
diff --git a/read_test.go b/read_test.go
@@ -40,8 +40,6 @@ func readFixturesExample(t *testing.T, doc *testmark.Document) {
 }
 
 func TestParseCRLF(t *testing.T) {
-	t.Skip("currently broken")
-
 	input, err := ioutil.ReadFile(filepath.Join("testdata", "example.md"))
 	if err != nil {
 		t.Fatal(err)

diff --git a/testmark.go b/testmark.go
@@ -52,6 +52,11 @@ type Hunk struct {
 
 	// The full body of the hunk, as bytes.
 	// (This is *still* a subslice of Document.Original, if this hunk was created by Parse, but probably a unique slice otherwise.)
+	//
+	// When produced by Parse, the Body has been normalized to have '\n' linebreaks if it originally contained '\r\n'.
+	// This is meant as a practical concession to the fact that some systems in the Windows ecosystem tend to mutate documents when checking them out of version control,
+	// and thus testmark finds it practical to pave that back out again rather than making it an application-level problem.
+	// (If such a normalization had to be applied, the earlier comment about subslicing of Document.Original probably no longer applies.)
 	Body []byte
 }