Skip to content

Commit

Permalink
chore: change PDF text extraction logics
Browse files Browse the repository at this point in the history
  • Loading branch information
filzrev committed Nov 23, 2024
1 parent 7102b42 commit c2df7e0
Showing 1 changed file with 23 additions and 1 deletion.
24 changes: 23 additions & 1 deletion test/docfx.Snapshot.Tests/SamplesTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
using UglyToad.PdfPig;
using UglyToad.PdfPig.Actions;
using UglyToad.PdfPig.Annotations;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
using UglyToad.PdfPig.Outline;

namespace Docfx.Tests;
Expand Down Expand Up @@ -80,7 +82,7 @@ void PdfToJson(string path)
{
p.Number,
p.NumberOfImages,
p.Text,
Text = ExtractText(p),
Links = p.ExperimentalAccess.GetAnnotations().Select(ToLink).ToArray(),
}).ToArray(),
Bookmarks = document.TryGetBookmarks(out var bookmarks) ? ToBookmarks(bookmarks.Roots) : null,
Expand Down Expand Up @@ -207,4 +209,24 @@ private void ScrubFile(string path, StringBuilder builder)
}));
}
}

private string ExtractText(Page page)
{
// Gets PDF text content
var text = ContentOrderTextExtractor.GetText(page, new ContentOrderTextExtractor.Options { ReplaceWhitespaceWithSpace = true });

// string.Normalize is not works when using `Globalization Invariant Mode`.
StringBuilder sb = new(text);

// Normalize known ligature chars. (Note: `string.Normalize` is not works when using `Globalization Invariant Mode`)
sb.Replace("ff", "ff");
sb.Replace("ffi", "ffi");
sb.Replace("fl", "fl");
sb.Replace("fi", "fi");

// Normalize newline char.
sb.Replace("\r\n", "\n");

return sb.ToString();
}
}

0 comments on commit c2df7e0

Please sign in to comment.