Skip to content

Commit

Permalink
Support importing zip files
Browse files Browse the repository at this point in the history
  • Loading branch information
cyanfish committed Jan 27, 2025
1 parent d32c153 commit 6f5e83e
Show file tree
Hide file tree
Showing 4 changed files with 227 additions and 16 deletions.
5 changes: 5 additions & 0 deletions NAPS2.Images/ImageContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ private static ImageFileFormat GetFileFormatFromFirstBytes(Stream stream)
stream.Read(firstBytes, 0, 8);
stream.Seek(0, SeekOrigin.Begin);

return GetFileFormatFromFirstBytes(firstBytes);
}

public static ImageFileFormat GetFileFormatFromFirstBytes(byte[] firstBytes)
{
return firstBytes switch
{
[0x89, 0x50, 0x4E, 0x47, ..] => ImageFileFormat.Png,
Expand Down
156 changes: 156 additions & 0 deletions NAPS2.Sdk.Tests/ImportExport/FileImporterTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
using NAPS2.ImportExport;
using NAPS2.Pdf.Pdfium;
using Xunit;

namespace NAPS2.Sdk.Tests.ImportExport;

/// <summary>
/// Tests for <see cref="FileImporter"/>: images, PDFs and zip archives imported from both file paths and
/// streams, plus unsupported inputs and inputs whose extension does not match their content.
/// </summary>
public class FileImporterTests : ContextualTests
{
    private readonly FileImporter _fileImporter;

    public FileImporterTests()
    {
        _fileImporter = new FileImporter(ScanningContext);
        SetUpFileStorage();
    }

    // Asserts that the storage is an ImageFileStorage whose backing file has the expected extension.
    private static void AssertFileStorageExtension(object storage, string expectedExtension)
    {
        var fileStorage = Assert.IsType<ImageFileStorage>(storage);
        Assert.Equal(expectedExtension, Path.GetExtension(fileStorage.FullPath));
    }

    [Fact]
    public async Task ImportPngImage()
    {
        var pngPath = CopyResourceToFile(ImageResources.skewed_bw, "image.png");

        var images = await _fileImporter.Import(pngPath, new ImportParams()).ToListAsync();

        Assert.Single(images);
        AssertFileStorageExtension(images[0].Storage, ".png");
    }

    [Fact]
    public async Task ImportPngStream()
    {
        var pngStream = new MemoryStream(ImageResources.skewed_bw);

        var images = await _fileImporter.Import(pngStream, new ImportParams()).ToListAsync();

        Assert.Single(images);
        AssertFileStorageExtension(images[0].Storage, ".png");
    }

    [Fact]
    public async Task ImportJpegImage()
    {
        var jpegPath = CopyResourceToFile(ImageResources.dog, "image.jpg");

        var images = await _fileImporter.Import(jpegPath, new ImportParams()).ToListAsync();

        Assert.Single(images);
        AssertFileStorageExtension(images[0].Storage, ".jpg");
    }

    [Fact]
    public async Task ImportJpegStream()
    {
        var jpegStream = new MemoryStream(ImageResources.dog);

        var images = await _fileImporter.Import(jpegStream, new ImportParams()).ToListAsync();

        Assert.Single(images);
        AssertFileStorageExtension(images[0].Storage, ".jpg");
    }

    [Fact]
    public async Task ImportPdfFile()
    {
        var pdfPath = CopyResourceToFile(PdfResources.word_patcht_pdf, "word.pdf");

        var images = await _fileImporter.Import(pdfPath, new ImportParams()).ToListAsync();

        Assert.Single(images);
        AssertFileStorageExtension(images[0].Storage, ".pdf");
    }

    [Fact]
    public async Task ImportPdfStream()
    {
        var pdfStream = new MemoryStream(PdfResources.word_patcht_pdf);

        var images = await _fileImporter.Import(pdfStream, new ImportParams()).ToListAsync();

        Assert.Single(images);
        AssertFileStorageExtension(images[0].Storage, ".pdf");
    }

    [Fact]
    public async Task ImportZipFile()
    {
        var zipPath = CopyResourceToFile(BinaryResources.animals, "animals.zip");

        var images = await _fileImporter.Import(zipPath, new ImportParams()).ToListAsync();

        // The archive is expected to produce two images; only the first one's storage is inspected.
        Assert.Equal(2, images.Count);
        AssertFileStorageExtension(images[0].Storage, ".jpg");
    }

    [Fact]
    public async Task ImportZipStream()
    {
        var zipStream = new MemoryStream(BinaryResources.animals);

        var images = await _fileImporter.Import(zipStream, new ImportParams()).ToListAsync();

        // The archive is expected to produce two images; only the first one's storage is inspected.
        Assert.Equal(2, images.Count);
        AssertFileStorageExtension(images[0].Storage, ".jpg");
    }

    [Fact]
    public async Task ImportUnsupportedFile()
    {
        var certPath = CopyResourceToFile(BinaryResources.testcert, "something.crt");

        Func<Task> importAll = async () =>
            await _fileImporter.Import(certPath, new ImportParams()).ToListAsync();

        await Assert.ThrowsAsync<NotSupportedException>(importAll);
    }

    [Fact]
    public async Task ImportUnsupportedStream()
    {
        var certStream = new MemoryStream(BinaryResources.testcert);

        Func<Task> importAll = async () =>
            await _fileImporter.Import(certStream, new ImportParams()).ToListAsync();

        await Assert.ThrowsAsync<NotSupportedException>(importAll);
    }

    [Fact]
    public async Task ImportImageWithPdfExtension()
    {
        // A JPEG saved under a ".pdf" name: the import is expected to fail with a PdfiumException.
        var mislabeledPath = CopyResourceToFile(ImageResources.dog, "image.pdf");

        Func<Task> importAll = async () =>
            await _fileImporter.Import(mislabeledPath, new ImportParams()).ToListAsync();

        await Assert.ThrowsAsync<PdfiumException>(importAll);
    }

    [Fact]
    public async Task ImportPdfWithImageExtension()
    {
        // A PDF saved under a ".jpg" name: the import is expected to fail with a NotSupportedException.
        var mislabeledPath = CopyResourceToFile(PdfResources.word_patcht_pdf, "pdf.jpg");

        Func<Task> importAll = async () =>
            await _fileImporter.Import(mislabeledPath, new ImportParams()).ToListAsync();

        await Assert.ThrowsAsync<NotSupportedException>(importAll);
    }
}
80 changes: 65 additions & 15 deletions NAPS2.Sdk/ImportExport/FileImporter.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using NAPS2.Pdf;
using System.IO.Compression;
using NAPS2.Pdf;
using NAPS2.Scan;

namespace NAPS2.ImportExport;
Expand All @@ -22,36 +23,85 @@ public FileImporter(PdfImporter pdfImporter, ImageImporter imageImporter)
_imageImporter = imageImporter;
}


/// <summary>
/// Imports the file at <paramref name="filePath"/> by wrapping the path in an
/// <see cref="InputPathOrStream"/> and delegating to the overload that accepts one.
/// </summary>
/// <param name="filePath">Path of the file to import.</param>
/// <param name="importParams">Optional import parameters, forwarded unchanged.</param>
/// <param name="progress">Progress handler, forwarded unchanged.</param>
/// <returns>The sequence of images produced by the underlying importer.</returns>
/// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null.</exception>
public IAsyncEnumerable<ProcessedImage> Import(string filePath, ImportParams? importParams = null,
    ProgressHandler progress = default)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException further down the call chain.
    if (filePath == null)
    {
        throw new ArgumentNullException(nameof(filePath));
    }

    return Import(new InputPathOrStream(filePath, null, null), importParams, progress);
}

/// <summary>
/// Imports the content of <paramref name="stream"/> by wrapping it in an
/// <see cref="InputPathOrStream"/> (without a file name) and delegating to the overload that accepts one.
/// </summary>
/// <param name="stream">Stream containing the data to import.</param>
/// <param name="importParams">Optional import parameters, forwarded unchanged.</param>
/// <param name="progress">Progress handler, forwarded unchanged.</param>
/// <returns>The sequence of images produced by the underlying importer.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public IAsyncEnumerable<ProcessedImage> Import(Stream stream, ImportParams? importParams = null,
    ProgressHandler progress = default)
{
    // A null stream would otherwise be wrapped as an input with neither a path nor a stream
    // and only fail later with an unrelated exception.
    if (stream == null)
    {
        throw new ArgumentNullException(nameof(stream));
    }

    return Import(new InputPathOrStream(null, stream, null), importParams, progress);
}

/// <summary>
/// Routes <paramref name="input"/> to the PDF importer, the zip importer or the image importer.
/// The file extension is checked first and takes precedence; only when it is not recognized are the
/// first bytes of the content inspected.
/// </summary>
/// <param name="input">The file path or stream to import.</param>
/// <param name="importParams">Optional import parameters, forwarded to the selected importer.</param>
/// <param name="progress">Progress handler, forwarded to the selected importer.</param>
/// <param name="skipUnsupported">
/// When true, an input with an unrecognized extension whose leading bytes are not a PDF, a zip archive
/// or a known image format produces an empty sequence instead of being handed to the image importer.
/// </param>
/// <returns>The sequence of images produced by the selected importer.</returns>
internal IAsyncEnumerable<ProcessedImage> Import(InputPathOrStream input, ImportParams? importParams = null,
    ProgressHandler progress = default, bool skipUnsupported = false)
{
    // Compute the extension once and compare ordinally, ignoring case.
    var extension = Path.GetExtension(input.FileName);
    if (string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase))
    {
        return _pdfImporter.Import(input, importParams, progress);
    }
    if (string.Equals(extension, ".zip", StringComparison.OrdinalIgnoreCase))
    {
        return ImportZip(input, importParams, progress);
    }
    if (ImageContext.GetFileFormatFromExtension(input.FileName) != ImageFileFormat.Unknown)
    {
        return _imageImporter.Import(input, importParams, progress);
    }

    // If we couldn't infer the format from the extension, we will try and read the content itself
    byte[] firstBytes;
    if (input.Stream != null)
    {
        firstBytes = ReadLeadingBytes(input.Stream, 8);
    }
    else
    {
        using var stream = new FileStream(input.FilePath!, FileMode.Open, FileAccess.Read);
        firstBytes = ReadLeadingBytes(stream, 8);
    }

    // PDFs begin with "%PDF", possibly with a UTF-8 BOM first
    if (firstBytes is [0x25, 0x50, 0x44, 0x46, ..] or [0xEF, 0xBB, 0xBF, 0x25, 0x50, 0x44, 0x46, ..])
    {
        return _pdfImporter.Import(input, importParams, progress);
    }
    // Zip archives begin with "PK\x03\x04"
    if (firstBytes is [0x50, 0x4b, 0x03, 0x04, ..])
    {
        return ImportZip(input, importParams, progress);
    }

    // If we're recursively importing a zip file, we should ignore any entries that aren't supported formats
    // rather than trying to import them and throwing an exception.
    if (skipUnsupported && ImageContext.GetFileFormatFromFirstBytes(firstBytes) == ImageFileFormat.Unknown)
    {
        return AsyncProducers.Empty<ProcessedImage>();
    }

    return _imageImporter.Import(input, importParams, progress);
}

// Reads up to `count` bytes from the start of the stream and rewinds it afterwards.
// Stream.Read may return fewer bytes than requested, so keep reading until the buffer is full
// or the end of the stream is reached; unread positions stay zero.
private static byte[] ReadLeadingBytes(Stream stream, int count)
{
    var buffer = new byte[count];
    stream.Seek(0, SeekOrigin.Begin);
    int filled = 0;
    while (filled < count)
    {
        int read = stream.Read(buffer, filled, count - filled);
        if (read == 0)
        {
            break;
        }
        filled += read;
    }
    stream.Seek(0, SeekOrigin.Begin);
    return buffer;
}

/// <summary>
/// Imports the contents of a zip archive, opened from the input's stream when present and from its
/// file path otherwise. Each entry with a non-zero length (this excludes e.g. directory entries) is
/// copied into memory and imported recursively with unsupported content skipped.
/// </summary>
/// <param name="input">The zip archive, given as a stream or a file path.</param>
/// <param name="importParams">Optional import parameters, forwarded to each nested import.</param>
/// <param name="progress">Receives one report before the first entry and one after each entry.</param>
/// <returns>The images produced by all entries, in archive order.</returns>
private async IAsyncEnumerable<ProcessedImage> ImportZip(InputPathOrStream input, ImportParams? importParams,
    ProgressHandler progress)
{
    using var archive = input.Stream is { } sourceStream
        ? new ZipArchive(sourceStream)
        : ZipFile.OpenRead(input.FilePath!);

    var nonEmptyEntries = archive.Entries.Where(candidate => candidate.Length > 0).ToList();
    int total = nonEmptyEntries.Count;

    progress.Report(0, total);
    for (int index = 0; index < total; index++)
    {
        var entry = nonEmptyEntries[index];
        using var entryStream = entry.Open();

        // Copy the entry into memory so the nested import works on a MemoryStream.
        var buffered = new MemoryStream();
        entryStream.CopyTo(buffered);
        var entryInput = new InputPathOrStream(null, buffered, entry.Name);

        // Only the cancellation token is forwarded to the nested import (presumably so that it does not
        // overwrite the per-entry progress reported here - TODO confirm).
        await foreach (var image in Import(entryInput, importParams, progress.CancelToken, skipUnsupported: true))
        {
            yield return image;
        }

        progress.Report(index + 1, total);
    }
}
}
2 changes: 1 addition & 1 deletion NAPS2.Sdk/ImportExport/InputPathOrStream.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ namespace NAPS2.ImportExport;
internal record InputPathOrStream(string? FilePath, Stream? Stream, string? StreamFileName)
{
/// <summary>
/// The file name associated with this input. For stream inputs this is <see cref="StreamFileName"/>,
/// or an empty string when no name was supplied; for path inputs it is the file-name portion of
/// <see cref="FilePath"/>.
/// </summary>
public string FileName => Stream != null
    ? StreamFileName ?? ""
    : Path.GetFileName(FilePath)!;

public void CopyToFile(string outputPath)
Expand Down

0 comments on commit 6f5e83e

Please sign in to comment.