Skip to content

Commit

Permalink
Merge pull request #447 from jltrem/jltrem/pdf-image-consolidation
Browse files Browse the repository at this point in the history
Add image consolidation
  • Loading branch information
ststeiger authored Jul 12, 2024
2 parents d0b0a42 + f1323f4 commit 0783e08
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 4 deletions.
Binary file added PdfSharpCore.Test/Assets/frog-and-toad.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
77 changes: 73 additions & 4 deletions PdfSharpCore.Test/Merge.cs
Original file line number Diff line number Diff line change
@@ -1,25 +1,74 @@
using System.IO;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using PdfSharpCore.Drawing;
using PdfSharpCore.Drawing.Layout;
using PdfSharpCore.Pdf;
using PdfSharpCore.Pdf.IO;
using PdfSharpCore.Test.Helpers;
using Xunit;
using Xunit.Abstractions;

namespace PdfSharpCore.Test
{
public class Merge
{
private readonly ITestOutputHelper _output;

/// <summary>
/// Creates the test class and keeps the supplied output helper in <c>_output</c>.
/// </summary>
/// <param name="output">The xUnit output helper to store.</param>
public Merge(ITestOutputHelper output) => _output = output;

/// <summary>
/// Merges the "FamilyTree.pdf" and "test.pdf" assets into a single document
/// and saves the result as "merge.pdf" in the output directory.
/// </summary>
[Fact]
public void CanMerge2Documents()
{
    string[] inputPaths =
    {
        PathHelper.GetInstance().GetAssetPath("FamilyTree.pdf"),
        PathHelper.GetInstance().GetAssetPath("test.pdf"),
    };

    PdfDocument merged = MergeDocuments(inputPaths);

    string targetPath = CreateOutFilePath("merge.pdf");
    merged.Save(targetPath);
}

/// <summary>
/// Builds two single-image PDFs, merges 50 copies of each into one document,
/// saves it before and after <c>ConsolidateImages()</c>, and asserts that the
/// consolidated file is smaller than a quarter of the unconsolidated one.
/// </summary>
[Fact]
public void CanConsolidateImageDataInDocument()
{
    // Two source documents, each embedding a different image asset.
    PdfDocument firstSource = CreateTestDocumentWithImage("lenna.png");
    PdfDocument secondSource = CreateTestDocumentWithImage("frog-and-toad.jpg");

    string firstPath = CreateOutFilePath("image-doc1.pdf");
    firstSource.Save(firstPath);

    string secondPath = CreateOutFilePath("image-doc2.pdf");
    secondSource.Save(secondPath);

    // Alternate the two inputs 50 times so every image is duplicated in the merge.
    var mergeInputs = new List<string>();
    for (var round = 1; round <= 50; round++)
    {
        mergeInputs.Add(firstPath);
        mergeInputs.Add(secondPath);
    }

    PdfDocument merged = MergeDocuments(mergeInputs);

    // Snapshot of the merged document before consolidation.
    string mergedFilePath = CreateOutFilePath("images-merged.pdf");
    merged.Save(mergedFilePath);

    // Snapshot of the same document after consolidation.
    merged.ConsolidateImages();
    string consolidatedFilePath = CreateOutFilePath("images-merged-consolidated.pdf");
    merged.Save(consolidatedFilePath);

    long sizeBefore = new FileInfo(mergedFilePath).Length;
    long sizeAfter = new FileInfo(consolidatedFilePath).Length;
    Assert.True(sizeAfter < sizeBefore / 4);
}

private static PdfDocument MergeDocuments(IEnumerable<string> pdfPaths)
{
var outputDocument = new PdfDocument();

foreach (var pdfPath in new[] { pdf1Path, pdf2Path })
foreach (var pdfPath in pdfPaths)
{
using var fs = File.OpenRead(pdfPath);
var inputDocument = Pdf.IO.PdfReader.Open(fs, PdfDocumentOpenMode.Import);

var count = inputDocument.PageCount;
for (var idx = 0; idx < count; idx++)
{
Expand All @@ -28,14 +77,34 @@ public void CanMerge2Documents()
}
}

var outFilePath = Path.Combine(PathHelper.GetInstance().RootDir, "Out", "merge.pdf");
return outputDocument;
}

/// <summary>
/// Builds the full path of <paramref name="filename"/> inside the "Out" folder
/// under the test root directory, creating that folder if it is missing.
/// </summary>
/// <param name="filename">File name (with extension) to place in the output folder.</param>
/// <returns>The combined output file path. The file itself is not created.</returns>
private static string CreateOutFilePath(string filename)
{
    var outFilePath = Path.Combine(PathHelper.GetInstance().RootDir, "Out", filename);

    // Directory.CreateDirectory does nothing when the directory already exists,
    // so no separate existence check is needed.
    var dir = Path.GetDirectoryName(outFilePath);
    Directory.CreateDirectory(dir);

    // This helper only computes the path; saving is the caller's job
    // (there is no document in scope here).
    return outFilePath;
}

/// <summary>
/// Creates a one-page document that shows the image's file name as a caption
/// and draws the image asset of that name below it.
/// </summary>
/// <param name="imageFilename">File name of the image asset, resolved through <c>PathHelper.GetAssetPath</c>.</param>
/// <returns>The new, unsaved document.</returns>
private static PdfDocument CreateTestDocumentWithImage(string imageFilename)
{
    var doc = new PdfDocument();
    var page = doc.AddPage();
    var gfx = XGraphics.FromPdfPage(page);

    // Caption: the file name, laid out in a box near the top of the page.
    var captionWriter = new XTextFormatter(gfx);
    var captionBox = new XRect(12, 12, 400, 50);
    var captionFont = new XFont("Arial", 12);
    captionWriter.DrawString(imageFilename, captionFont, XBrushes.Black, captionBox);

    // The image itself, placed below the caption.
    var imagePath = PathHelper.GetInstance().GetAssetPath(imageFilename);
    var image = XImage.FromFile(imagePath);
    gfx.DrawImage(image, new XPoint(12, 100));

    return doc;
}
}
}
3 changes: 3 additions & 0 deletions PdfSharpCore.Test/PdfSharpCore.Test.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
<None Update="Assets\**\*.png">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Assets\frog-and-toad.jpg">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>

</Project>
80 changes: 80 additions & 0 deletions PdfSharpCore/Pdf/PdfDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@
#endregion

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Security.Cryptography;
using System.Text;
using PdfSharpCore.Pdf.Advanced;
using PdfSharpCore.Pdf.Internal;
using PdfSharpCore.Pdf.IO;
Expand Down Expand Up @@ -809,6 +812,83 @@ public void MakeAcroFormsReadOnly()
}
}

/// <summary>
/// Makes every image XObject entry whose stream data has the same MD5 hash point
/// to a single shared item, so identical image data is referenced once instead of
/// once per occurrence (for example after merging many copies of the same page).
/// </summary>
/// <remarks>
/// Only images returned by <c>ImageInfo.FindAll</c> are considered.
/// NOTE(review): equality is decided by the MD5 of the stream bytes alone; two images
/// with identical stream bytes but different dictionary entries would be treated as
/// the same image — confirm this is acceptable for the documents being processed.
/// </remarks>
public void ConsolidateImages()
{
    var images = ImageInfo.FindAll(this);

    // Pick one representative item per distinct MD5 (the last occurrence wins).
    // The MD5 is read straight from each ImageInfo: keying a lookup by
    // GetHashCode() is unsafe because hash codes are not guaranteed to be unique.
    var representativeByMd5 = new Dictionary<string, PdfItem>(StringComparer.Ordinal);
    foreach (ImageInfo img in images)
    {
        representativeByMd5[img.XObjectMD5] = img.Item.Value;
    }

    // Re-point every image entry at the representative chosen for its MD5.
    foreach (ImageInfo img in images)
    {
        img.XObjects.Elements[img.Item.Key] = representativeByMd5[img.XObjectMD5];
    }
}

/// <summary>
/// Describes one image XObject entry found in a page's "/XObject" resource
/// dictionary, together with the MD5 hash of the image's stream data.
/// </summary>
internal class ImageInfo
{
    /// <summary>The "/XObject" resource dictionary that contains the entry.</summary>
    public PdfDictionary XObjects { get; }

    /// <summary>The entry (resource name and item) inside <see cref="XObjects"/>.</summary>
    public KeyValuePair<string, PdfItem> Item { get; }

    /// <summary>The image dictionary the entry refers to.</summary>
    public PdfDictionary XObject { get; }

    /// <summary>Lower-case hexadecimal MD5 hash of the image's stream bytes.</summary>
    public string XObjectMD5 { get; }

    /// <summary>
    /// Captures the entry and computes the MD5 hash of the image stream.
    /// </summary>
    /// <param name="xObjects">The "/XObject" dictionary holding the entry.</param>
    /// <param name="item">The entry within <paramref name="xObjects"/>.</param>
    /// <param name="xObject">The image dictionary; its stream data is hashed.</param>
    public ImageInfo(PdfDictionary xObjects, KeyValuePair<string, PdfItem> item, PdfDictionary xObject)
    {
        XObjects = xObjects;
        Item = item;
        XObject = xObject;
        XObjectMD5 = ComputeMD5(xObject.Stream.Value);
    }

    /// <summary>
    /// Get info for each image in the document.
    /// </summary>
    /// <remarks>
    /// Looks at the "/Resources" → "/XObject" dictionary of each page and keeps the
    /// entries that are references to dictionaries with "/Subtype" "/Image".
    /// Entries without stream data are skipped because there is nothing to hash.
    /// </remarks>
    internal static List<ImageInfo> FindAll(PdfDocument doc) =>
        doc.Pages.Cast<PdfPage>()
            .Select(page => page.Elements.GetDictionary("/Resources"))
            .Select(resources => resources?.Elements?.GetDictionary("/XObject"))
            .Where(xObjects => xObjects?.Elements != null)
            .SelectMany(xObjects =>
                from item in xObjects.Elements
                let xObject = (item.Value as PdfReference)?.Value as PdfDictionary
                where xObject?.Elements?.GetString("/Subtype") == "/Image"
                      && xObject.Stream?.Value != null
                select new ImageInfo(xObjects, item, xObject)
            )
            .ToList();

    /// <summary>
    /// Compute and return the MD5 hash of the input data.
    /// </summary>
    /// <param name="input">The bytes to hash.</param>
    /// <returns>The hash as a 32-character lower-case hexadecimal string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    internal static string ComputeMD5(byte[] input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        // A short-lived hasher per call: no shared mutable state, so no lock is
        // needed, and the instance is disposed as soon as the hash is computed.
        byte[] hashBytes;
        using (var hasher = MD5.Create())
        {
            hashBytes = hasher.ComputeHash(input);
        }

        var sb = new StringBuilder(hashBytes.Length * 2);
        foreach (var x in hashBytes)
        {
            sb.Append(x.ToString("x2"));
        }

        return sb.ToString();
    }
}

/// <summary>
/// Gets the security handler.
/// </summary>
Expand Down

0 comments on commit 0783e08

Please sign in to comment.