Skip to content

Commit

Permalink
fix for random spaces problem (issue smalot#72).patch
Browse files Browse the repository at this point in the history
  • Loading branch information
panique committed Nov 21, 2020
1 parent d2f2248 commit 521e676
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
debug*
composer.lock
/.php_cs.cache
/.phpunit.result.cache
Binary file added samples/bugs/Issue72.pdf
Binary file not shown.
42 changes: 32 additions & 10 deletions src/Smalot/PdfParser/PDFObject.php
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,30 @@ class PDFObject
protected $content = null;

/**
* @param Header $header
* @param string $content
* Configuration array.
*/
public function __construct(Document $document, Header $header = null, $content = null)
protected $cfg = [
// if `true` ignore spacing between letters (= fix random spaces inside words)
'ignore_letter_spacing' => false,
];

/**
 * PDFObject constructor.
 *
 * @param Document    $document document stored on this object
 * @param Header|null $header   object header; a new empty Header is created when null is given
 * @param string|null $content  content stored on this object as-is
 * @param array       $config   option overrides; keys that are not given keep the
 *                              defaults declared in $cfg (e.g. 'ignore_letter_spacing')
 */
public function __construct(Document $document, Header $header = null, $content = null, $config = [])
{
    $this->document = $document;
    $this->header = null !== $header ? $header : new Header();
    $this->content = $content;

    // Merge over the defaults instead of replacing them: getText() reads
    // $this->cfg['ignore_letter_spacing'] unconditionally, so a partial
    // $config must not drop that key. The empty() guard keeps tolerating
    // a null/empty $config without touching the defaults.
    if (!empty($config)) {
        $this->cfg = array_merge($this->cfg, $config);
    }
}

public function init()
Expand Down Expand Up @@ -282,14 +298,18 @@ public function getText(Page $page = null)
if (((float) $x <= 0) ||
(false !== $current_position_td['y'] && (float) $y < (float) ($current_position_td['y']))
) {
// vertical offset
$text .= "\n";
if (!$this->cfg['ignore_letter_spacing']) {
// vertical offset
$text .= "\n";
}
} elseif (false !== $current_position_td['x'] && (float) $x > (float) (
$current_position_td['x']
)
) {
// horizontal offset
$text .= ' ';
if (!$this->cfg['ignore_letter_spacing']) {
// horizontal offset
$text .= ' ';
}
}
$current_position_td = ['x' => $x, 'y' => $y];
break;
Expand All @@ -302,7 +322,9 @@ public function getText(Page $page = null)
if ((float) $y < 0) {
$text .= "\n";
} elseif ((float) $x <= 0) {
$text .= ' ';
if (!$this->cfg['ignore_letter_spacing']) {
$text .= ' ';
}
}
break;

Expand Down Expand Up @@ -724,7 +746,7 @@ public function getCommandsText($text_part, &$offset = 0)
*
* @return PDFObject
*/
public static function factory(Document $document, Header $header, $content)
public static function factory(Document $document, Header $header, $content, $config = [])
{
switch ($header->get('Type')->getContent()) {
case 'XObject':
Expand Down Expand Up @@ -758,7 +780,7 @@ public static function factory(Document $document, Header $header, $content)
return new Font($document, $header, $content);

default:
return new self($document, $header, $content);
return new self($document, $header, $content, $config);
}
}

Expand Down
14 changes: 12 additions & 2 deletions src/Smalot/PdfParser/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@
*/
class Parser
{
/**
 * Default configuration of the parser.
 *
 * Options passed to the constructor are merged over these defaults, and the
 * resulting array is handed to PDFObject::factory() for each parsed object.
 *
 * @var array
 */
protected $cfg = [
// if `true`, ignore spacing between letters (= fixes random spaces inside words)
'ignore_letter_spacing' => false,
];

/**
* @var PDFObject[]
*/
Expand All @@ -55,6 +63,7 @@ class Parser

/**
 * Creates a parser whose configuration is the defaults overlaid with $cfg.
 *
 * @param array $cfg option overrides; also passed unchanged to the RawDataParser
 */
public function __construct($cfg = [])
{
    // Caller-supplied options win over the defaults declared on the class.
    $effectiveConfig = array_merge($this->cfg, $cfg);
    $this->cfg = $effectiveConfig;

    // The raw data parser receives the caller's options, not the merged set.
    $this->rawDataParser = new RawDataParser($cfg);
}

Expand Down Expand Up @@ -104,6 +113,7 @@ public function parseContent($content)

// Create destination object.
$document = new Document();
// TODO: set the config here
$this->objects = [];

foreach ($data as $id => $structure) {
Expand Down Expand Up @@ -205,7 +215,7 @@ protected function parseObject($id, $structure, $document)
$sub_content = substr($content, $position, (int) $next_position - (int) $position);

$sub_header = Header::parse($sub_content, $document);
$object = PDFObject::factory($document, $sub_header, '');
$object = PDFObject::factory($document, $sub_header, '', $this->cfg);
$this->objects[$id] = $object;
}

Expand All @@ -229,7 +239,7 @@ protected function parseObject($id, $structure, $document)
}

if (!isset($this->objects[$id])) {
$this->objects[$id] = PDFObject::factory($document, $header, $content);
$this->objects[$id] = PDFObject::factory($document, $header, $content, $this->cfg);
}
}

Expand Down
22 changes: 22 additions & 0 deletions tests/Integration/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,28 @@ public function testIssue19()
$this->assertArrayHasKey('17_0', $objects);
}

/**
 * Text whose letters are spaced "too wide" used to be extracted with random
 * spaces inside words.
 *
 * The same sample file is parsed twice: once with the default parser, where
 * the spaced-out text is expected, and once with a parser configured with
 * 'ignore_letter_spacing' => true, where the words are expected unbroken.
 *
 * @see https://github.com/smalot/pdfparser/issues/72
 * @see https://github.com/smalot/pdfparser/issues/314
 */
public function testIssue72()
{
    $samplePath = $this->rootDir.'/samples/bugs/Issue72.pdf';

    // Parse with the default configuration first ...
    $defaultDocument = $this->fixture->parseFile($samplePath);

    // ... then with a parser that skips the letter-spacing handling.
    $compactParser = new Parser(['ignore_letter_spacing' => true]);
    $compactDocument = $compactParser->parseFile($samplePath);

    $this->assertStringContainsString(
        '1Der Z we it e W e l t kr i eg',
        $defaultDocument->getText()
    );
    $this->assertStringContainsString(
        '1Der Zweite Weltkrieg',
        $compactDocument->getText()
    );
}

/**
* Properly decode ANSI encodings without producing scrambled UTF-8 characters
*
Expand Down

0 comments on commit 521e676

Please sign in to comment.