Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for two bugs related to Unicode translation support by Font objects #698

Merged
merged 20 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Font.php
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ public function loadTranslateTable(): array
// Support for multiple bfchar sections
if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $matches)) {
foreach ($matches['sections'] as $section) {
$regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
$regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

preg_match_all($regexp, $section, $matches);

Expand Down
12 changes: 12 additions & 0 deletions src/Smalot/PdfParser/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,18 @@ class Page extends PDFObject
*/
protected $dataTm;

/**
* @param array<\Smalot\PdfParser\Font> $fonts
*
* @internal
*/
public function setFonts($fonts)
unixnut marked this conversation as resolved.
Show resolved Hide resolved
{
if (empty($this->fonts)) {
$this->fonts = $fonts;
k00ni marked this conversation as resolved.
Show resolved Hide resolved
}
}

/**
* @return Font[]
*/
Expand Down
60 changes: 59 additions & 1 deletion src/Smalot/PdfParser/Pages.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,13 @@
class Pages extends PDFObject
{
/**
* @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them,
* @var array<\Smalot\PdfParser\Font>|null
*/
protected $fonts;

/**
* @todo Objects other than Pages or Page might need to be treated specifically
* in order to get Page objects out of them.
*
* @see https://github.com/smalot/pdfparser/issues/331
*/
Expand All @@ -57,17 +63,69 @@ public function getPages(bool $deep = false): array
return $kidsElement->getContent();
}

// Prepare to apply the Pages' object's fonts to each page
if (false === \is_array($this->fonts)) {
$this->setupFonts();
}
$fontsAvailable = 0 < \count($this->fonts);

$kids = $kidsElement->getContent();
$pages = [];

foreach ($kids as $kid) {
if ($kid instanceof self) {
$pages = array_merge($pages, $kid->getPages(true));
} elseif ($kid instanceof Page) {
if ($fontsAvailable) {
$kid->setFonts($this->fonts);
}
$pages[] = $kid;
}
}

return $pages;
}

/**
* Gathers information about fonts and collects them in a list.
*
* @return void
*
* @internal
*/
protected function setupFonts()
{
$resources = $this->get('Resources');

if (method_exists($resources, 'has') && $resources->has('Font')) {
// no fonts available, therefore stop here
if ($resources->get('Font') instanceof Element\ElementMissing) {
return;
}

if ($resources->get('Font') instanceof Header) {
$fonts = $resources->get('Font')->getElements();
} else {
$fonts = $resources->get('Font')->getHeader()->getElements();
}

$table = [];

foreach ($fonts as $id => $font) {
if ($font instanceof Font) {
$table[$id] = $font;

// Store too on cleaned id value (only numeric)
$id = preg_replace('/[^0-9\.\-_]/', '', $id);
if ('' != $id) {
$table[$id] = $font;
}
}
}

$this->fonts = $table;
} else {
$this->fonts = [];
}
}
}
106 changes: 106 additions & 0 deletions tests/PHPUnit/Integration/PagesTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
<?php

/**
* @file This file is part of the PdfParser library.
*
* @author Konrad Abicht <[email protected]>
*
* @date 2024-04-19
*
* @license LGPLv3
*
* @url <https://github.com/smalot/pdfparser>
*
* PdfParser is a pdf library written in PHP, extraction oriented.
* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program.
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
*/

namespace PHPUnitTests\Integration;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Element\ElementArray;
use Smalot\PdfParser\Font;
use Smalot\PdfParser\Header;
use Smalot\PdfParser\Page;
use Smalot\PdfParser\Pages;

/**
* @internal only for test purposes
*/
class PagesDummy extends Pages
{
/**
* The purpose of this function is to bypass the tedious
* work to setup instances which lead to a valid $fonts variable.
*
* @param array<\Smalot\PdfParser\Font> $fonts
*
* @return void
*/
public function setFonts($fonts)
{
$this->fonts = $fonts;
}
}

class PagesTest extends TestCase
{
public function testFontsArePassedFromPagesToPage(): void
{
// Create mock Document, Font and Page objects
$document = $this->createMock(Document::class);
$font1 = new Font($document);
$page = new Page($document);

// Create a Header object that indicates $page is a child
$header = new Header([
'Kids' => new ElementArray([
$page,
]),
], $document);

// Use this header to create a mock Pages object
$pages = new PagesDummy($document, $header);

// Apply $font1 as a Font object to this Pages object;
// setFonts is used here as part of PagesDummy, only to access
// the protected Pages::fonts variable; it is not a method
// available in production
$pages->setFonts([$font1]);

// Trigger setupFonts method in $pages
$pages->getPages(true);

// Since the $page object font list is empty, $font1 from Pages
// object must be passed to the Page object
$this->assertEquals([$font1], $page->getFonts());

// Create a second $font2 using a different method
$font2 = $this->createMock(Font::class);

// Update the fonts in $pages
$pages->setFonts([$font1, $font2]);

// Trigger setupFonts method in $pages
$pages->getPages(true);

// Now that $page already has a font, updates from $pages
// should not overwrite it
$this->assertEquals([$font1], $page->getFonts());
}
}
Loading