chore: change PDF text extraction logic

filzrev · Nov 23, 2024 · c2df7e0 · c2df7e0
1 parent 7102b42
commit c2df7e0
Showing 1 changed file with 23 additions and 1 deletion.
diff --git a/test/docfx.Snapshot.Tests/SamplesTest.cs b/test/docfx.Snapshot.Tests/SamplesTest.cs
@@ -11,6 +11,8 @@
 using UglyToad.PdfPig;
 using UglyToad.PdfPig.Actions;
 using UglyToad.PdfPig.Annotations;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
 using UglyToad.PdfPig.Outline;
 
 namespace Docfx.Tests;
@@ -80,7 +82,7 @@ void PdfToJson(string path)
                 {
                     p.Number,
                     p.NumberOfImages,
-                    p.Text,
+                    Text = ExtractText(p),
                     Links = p.ExperimentalAccess.GetAnnotations().Select(ToLink).ToArray(),
                 }).ToArray(),
                 Bookmarks = document.TryGetBookmarks(out var bookmarks) ? ToBookmarks(bookmarks.Roots) : null,
@@ -207,4 +209,24 @@ private void ScrubFile(string path, StringBuilder builder)
             }));
         }
     }
+
+    /// <summary>Extracts a page's text in content order, expanding known ligatures and replacing CRLF with LF.</summary>
+    /// <param name="page">The PDF page to read text from.</param>
+    /// <returns>The page text after ligature expansion and CRLF-to-LF replacement.</returns>
+    private static string ExtractText(Page page)
+    {
+        // Gets PDF text content with the extractor's ReplaceWhitespaceWithSpace option enabled.
+        var text = ContentOrderTextExtractor.GetText(page, new ContentOrderTextExtractor.Options { ReplaceWhitespaceWithSpace = true });
+
+        // Normalize known ligature chars by hand.
+        // (Note: `string.Normalize` does not work when using `Globalization Invariant Mode`.)
+        return new StringBuilder(text)
+            .Replace("ﬀ", "ff")
+            .Replace("ﬃ", "ffi")
+            .Replace("ﬂ", "fl")
+            .Replace("ﬁ", "fi")
+            // Normalize newline char.
+            .Replace("\r\n", "\n")
+            .ToString();
+    }
 }