Skip to content

Commit

Permalink
Fix for #434. Reworked the Document's object cache dictionary. (#435)
Browse files Browse the repository at this point in the history
* Fix for #434. Reworked the Document's object cache dictionary. The getObjectsByType() method now uses it correctly. The dictionary also should support subtype searches. Only one font is asked for and returned to get the default font.

* Added type declarations.

* Testing performance test workflow

* Testing performance test workflow

* Testing performance test workflow

* Testing performance test workflow

* Testing performance test workflow

* Testing performance test workflow

* Added performance testing as requested for PR to fix the issue #434

* Style fix

* File require fix.

* File require fix. Could not get autoload to work.

* GitHub performance is lower than in localhost.

* Style fix

* Performance tests GitHub Action name change.

* Autoload test (pretty sure this did not work before).

* Yep, autoload does not work. Revert.

* Performance tests run name change.

* Removed unnecessary PHPDocs and refactored methods to use Type Declarations instead when able.

* Style fix.

* Performance test also succeeds, when time is exactly the same as required (although this will likely never happen).

* More PHPDoc removal in favour of Type Declarations.

* Document cache dictionary performance test tweak.

* Removed unused parameters.

* Another Type Declarations fix.

* Another Type Declarations fix.

* Autoload test with composer update.

* Autoload test with composer update.

* Added the thesis document used in the document cache dictionary performance test to the repository. The author gave his approval.

* Automatic code style fix.

Co-authored-by: vagrant <[email protected]>
  • Loading branch information
jee7 and vagrant authored Aug 30, 2021
1 parent 5dd2329 commit e2ca3d2
Show file tree
Hide file tree
Showing 12 changed files with 213 additions and 22 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/performance.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Runs the standalone performance test script on pull requests and on pushes to master.
name: "Performance Tests"

on:
  pull_request:
  push:
    branches:
      - "master"

jobs:
  performance-tests:
    name: "Tests for the performance testing the PDF parsing"
    runs-on: "ubuntu-20.04"

    strategy:
      # fail-fast is a job strategy option; placed under `env` it would only
      # define an (unused) environment variable.
      fail-fast: true
      matrix:
        # NOTE(review): no step below references matrix.php, so the PHP version
        # preinstalled on the runner image is what actually executes - confirm
        # whether a PHP setup step was intended.
        php:
          - "7.4"

    steps:
      - name: "Checkout"
        uses: "actions/checkout@v2"

      # Installs dependencies so that vendor/autoload.php exists for the test script.
      - name: "Run composer for further autoloading"
        run: "composer update"

      - name: "Run performance tests"
        run: "php tests/Performance/runPerformanceTests.php"
Binary file added samples/DocumentWithLotsOfObjects.pdf
Binary file not shown.
59 changes: 45 additions & 14 deletions src/Smalot/PdfParser/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,26 @@ protected function buildDictionary()
$this->dictionary = [];

foreach ($this->objects as $id => $object) {
// Cache objects by type and subtype
$type = $object->getHeader()->get('Type')->getContent();

if (!empty($type)) {
$this->dictionary[$type][$id] = $id;
if (null != $type) {
if (!isset($this->dictionary[$type])) {
$this->dictionary[$type] = [
'all' => [],
'subtype' => [],
];
}

$this->dictionary[$type]['all'][$id] = $object;

$subtype = $object->getHeader()->get('Subtype')->getContent();
if (null != $subtype) {
if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
$this->dictionary[$type]['subtype'][$subtype] = [];
}
$this->dictionary[$type]['subtype'][$subtype][$id] = $object;
}
}
}
}
Expand Down Expand Up @@ -164,19 +180,26 @@ public function getObjectById(string $id)
return null;
}

/**
 * Tells whether getObjectsByType() yields at least one object for the given
 * type and (optional) subtype.
 */
public function hasObjectsByType(string $type, ?string $subtype = null): bool
{
    $matches = $this->getObjectsByType($type, $subtype);

    return [] !== $matches;
}

public function getObjectsByType(string $type, ?string $subtype = null): array
{
$objects = [];
if (!isset($this->dictionary[$type])) {
return [];
}

foreach ($this->objects as $id => $object) {
if ($object->getHeader()->get('Type') == $type &&
(null === $subtype || $object->getHeader()->get('Subtype') == $subtype)
) {
$objects[$id] = $object;
if (null != $subtype) {
if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
return [];
}

return $this->dictionary[$type]['subtype'][$subtype];
}

return $objects;
return $this->dictionary[$type]['all'];
}

/**
Expand All @@ -187,25 +210,33 @@ public function getFonts()
return $this->getObjectsByType('Font');
}

/**
 * Returns the first font returned by getFonts(), or null if there is none.
 */
public function getFirstFont(): ?Font
{
    $fonts = $this->getFonts();

    // reset() returns false for an empty array, which would violate the
    // ?Font return type and raise a TypeError; report "no font" as null.
    if ([] === $fonts) {
        return null;
    }

    return reset($fonts);
}

/**
* @return Page[]
*
* @throws \Exception
*/
public function getPages()
{
if (isset($this->dictionary['Catalog'])) {
if ($this->hasObjectsByType('Catalog')) {
// Search for catalog to list pages.
$id = reset($this->dictionary['Catalog']);
$catalogues = $this->getObjectsByType('Catalog');
$catalogue = reset($catalogues);

/** @var Pages $object */
$object = $this->objects[$id]->get('Pages');
$object = $catalogue->get('Pages');
if (method_exists($object, 'getPages')) {
return $object->getPages(true);
}
}

if (isset($this->dictionary['Pages'])) {
if ($this->hasObjectsByType('Pages')) {
// Search for pages to list kids.
$pages = [];

Expand All @@ -218,7 +249,7 @@ public function getPages()
return $pages;
}

if (isset($this->dictionary['Page'])) {
if ($this->hasObjectsByType('Page')) {
// Search for 'page' (unordered pages).
$pages = $this->getObjectsByType('Page');

Expand Down
4 changes: 1 addition & 3 deletions src/Smalot/PdfParser/Encoding.php
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,7 @@ public function translateChar($dec): int
}

/**
* Returns the name of the encoding class, if available.
*
* @return string Returns encoding class name if available or empty string (only prior PHP 7.4).
* Returns encoding class name if available or empty string (only prior PHP 7.4).
*
* @throws \Exception On PHP 7.4+ an exception is thrown if encoding class doesn't exist.
*/
Expand Down
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Header.php
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public function getDetails(bool $deep = true): array
/**
* Indicate if an element name is available in header.
*
* @param string $name the name of the element
* @param string $name the name of the element
*/
public function has(string $name): bool
{
Expand Down
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/PDFObject.php
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ private function getDefaultFont(Page $page = null): Font
$fonts = $page->getFonts();
}

$fonts = array_merge($fonts, array_values($this->document->getFonts()));
$fonts[] = $this->document->getFirstFont();

if (\count($fonts) > 0) {
return reset($fonts);
Expand Down
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ public function getTextArray(self $page = null): array
/**
* Gets all the text data with its internal representation of the page.
*
* @return array An array with the data and the internal representation
* Returns an array with the data and the internal representation
*/
public function extractRawData(): array
{
Expand Down
4 changes: 2 additions & 2 deletions tests/Integration/DocumentTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ public function testDictionary()

$objects = $document->getDictionary();
$this->assertEquals(1, \count($objects));
$this->assertEquals(1, \count($objects['Page']));
$this->assertEquals(2, $objects['Page'][2]);
$this->assertEquals(1, \count($objects['Page']['all']));
$this->assertEquals($object2, $objects['Page']['all'][2]);
}

public function testGetObjectsByType()
Expand Down
7 changes: 7 additions & 0 deletions tests/Performance/Exception/PerformanceFailException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?php

namespace Tests\Smalot\PdfParser\Performance\Exception;

/**
* Thrown by the performance test runner (runPerformanceTests.php) when the measured
* run time of a test breaks the limit given by the test's getMaxEstimatedTime().
*/
class PerformanceFailException extends \Exception
{
}
21 changes: 21 additions & 0 deletions tests/Performance/Test/AbstractPerformanceTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?php

namespace Tests\Smalot\PdfParser\Performance\Test;

/**
* Contract for a performance test executed by runPerformanceTests.php.
*
* The runner calls init() first and then measures only the duration of run().
*/
abstract class AbstractPerformanceTest
{
/**
* Initializes the test (e.g. fetches the files). Called before the timer starts, so it is not measured.
*/
abstract public function init(): void;

/**
* Executes the test. This is the call whose duration the runner measures.
*/
abstract public function run(): void;

/**
* Returns the time limit in seconds; a run() that takes longer than this is considered a fail.
*/
abstract public function getMaxEstimatedTime(): int;
}
83 changes: 83 additions & 0 deletions tests/Performance/Test/DocumentDictionaryCacheTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
<?php

/**
* @file This file is part of the PdfParser library.
*
* @author Konrad Abicht <[email protected]>
* @date 2020-06-01
*
* @author Sébastien MALOT <[email protected]>
* @date 2017-01-03
*
* @license LGPLv3
* @url <https://github.com/smalot/pdfparser>
*
* PdfParser is a pdf library written in PHP, extraction oriented.
* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program.
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
*/

namespace Tests\Smalot\PdfParser\Performance\Test;

use Smalot\PdfParser\Page;
use Smalot\PdfParser\Parser;

/**
 * Performance test that parses a PDF file which makes extensive use of
 * the getFirstFont() method of Document.php. If Document.php correctly uses a dictionary
 * to cache the objects inside the PDF file, then the parsing should be quick.
 * If it does not, the parsing can be extremely slow or even crash.
 */
class DocumentDictionaryCacheTest extends AbstractPerformanceTest
{
    /**
     * @var Parser
     */
    protected $parser;

    /**
     * @var string raw content of the sample PDF, loaded by init()
     */
    protected $data;

    /**
     * Creates the parser and loads the sample PDF into memory.
     *
     * @throws \RuntimeException if the sample PDF cannot be read
     */
    public function init(): void
    {
        $this->parser = new Parser();

        // load PDF file content
        $path = __DIR__.'/../../../samples/DocumentWithLotsOfObjects.pdf';
        $content = file_get_contents($path);

        // file_get_contents() returns false on failure; stop here with a clear
        // message instead of handing false to the parser in run().
        if (false === $content) {
            throw new \RuntimeException(sprintf('Could not read sample PDF "%s".', $path));
        }

        $this->data = $content;
    }

    /**
     * Parses the sample PDF and extracts the text of the pages with index 77 and 78.
     */
    public function run(): void
    {
        // give PDF content to function and parse it
        $pdf = $this->parser->parseContent($this->data);

        foreach ($pdf->getPages() as $i => $page) { /* @var $page Page */
            // Only the pages with index 77 and 78 are processed.
            // NOTE(review): presumably these pages trigger the heavy font lookups - confirm.
            if ($i < 77 || $i > 78) {
                continue;
            }

            $page->getText(); // Test this method
        }
    }

    /**
     * Returns the time limit for run(); the runner reports this value in seconds.
     */
    public function getMaxEstimatedTime(): int
    {
        return 20;
    }
}
21 changes: 21 additions & 0 deletions tests/Performance/runPerformanceTests.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?php

require __DIR__.'/../../vendor/autoload.php';

// All performance tests to execute; each one is timed separately.
$tests = [
    new \Tests\Smalot\PdfParser\Performance\Test\DocumentDictionaryCacheTest(),
];

foreach ($tests as $test) { /* @var $test \Tests\Smalot\PdfParser\Performance\Test\AbstractPerformanceTest */
    // init() runs before the timer starts, so preparation is not measured.
    $test->init();

    $startTime = microtime(true);
    $test->run();
    $time = microtime(true) - $startTime;

    $maxTime = $test->getMaxEstimatedTime();

    // A test fails only when it takes longer than its limit; needing exactly
    // the allowed time still counts as a pass.
    if ($time > $maxTime) {
        throw new \Tests\Smalot\PdfParser\Performance\Exception\PerformanceFailException(sprintf('Performance failed on test "%s". Time taken was %.2f seconds, expected at most %d seconds.', get_class($test), $time, $maxTime));
    }
}

0 comments on commit e2ca3d2

Please sign in to comment.