diff --git a/.gitignore b/.gitignore
index c685f194..56badf4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@
 debug*
 composer.lock
 /.php_cs.cache
+/.phpunit.result.cache
diff --git a/samples/bugs/Issue72.pdf b/samples/bugs/Issue72.pdf
new file mode 100644
index 00000000..05ccfdc1
Binary files /dev/null and b/samples/bugs/Issue72.pdf differ
diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php
index 488d7537..aedb464b 100644
--- a/src/Smalot/PdfParser/PDFObject.php
+++ b/src/Smalot/PdfParser/PDFObject.php
@@ -67,14 +67,30 @@ class PDFObject
     protected $content = null;
 
     /**
-     * @param Header $header
-     * @param string $content
+     * Configuration array.
      */
-    public function __construct(Document $document, Header $header = null, $content = null)
+    protected $cfg = [
+        // if `true` ignore spacing between letters (= fix random spaces inside words)
+        'ignore_letter_spacing' => false,
+    ];
+
+    /**
+     * PDFObject constructor
+     *
+     * @param Document    $document
+     * @param Header|null $header   falls back to an empty Header when null
+     * @param string|null $content
+     * @param array       $config   overrides for the defaults declared in $cfg
+     */
+    public function __construct(Document $document, Header $header = null, $content = null, $config = [])
     {
         $this->document = $document;
         $this->header = null !== $header ? $header : new Header();
         $this->content = $content;
+
+        // Merge instead of replace: a partial $config must not drop the defaults
+        // that getText() reads; the cast keeps a null argument harmless.
+        $this->cfg = array_merge($this->cfg, (array) $config);
     }
 
     public function init()
@@ -282,14 +298,18 @@ public function getText(Page $page = null)
                         if (((float) $x <= 0) ||
                             (false !== $current_position_td['y'] && (float) $y < (float) ($current_position_td['y']))
                         ) {
-                            // vertical offset
-                            $text .= "\n";
+                            if (!$this->cfg['ignore_letter_spacing']) {
+                                // vertical offset
+                                $text .= "\n";
+                            }
                         } elseif (false !== $current_position_td['x'] && (float) $x > (float) (
                                 $current_position_td['x']
                             )
                         ) {
-                            // horizontal offset
-                            $text .= ' ';
+                            if (!$this->cfg['ignore_letter_spacing']) {
+                                // horizontal offset
+                                $text .= ' ';
+                            }
                         }
                         $current_position_td = ['x' => $x, 'y' => $y];
                         break;
@@ -302,7 +322,9 @@ public function getText(Page $page = null)
                         if ((float) $y < 0) {
                             $text .= "\n";
                         } elseif ((float) $x <= 0) {
-                            $text .= ' ';
+                            if (!$this->cfg['ignore_letter_spacing']) {
+                                $text .= ' ';
+                            }
                         }
                         break;
 
@@ -724,7 +746,7 @@ public function getCommandsText($text_part, &$offset = 0)
      *
      * @return PDFObject
      */
-    public static function factory(Document $document, Header $header, $content)
+    public static function factory(Document $document, Header $header, $content, $config = [])
     {
         switch ($header->get('Type')->getContent()) {
             case 'XObject':
@@ -758,7 +780,7 @@ public static function factory(Document $document, Header $header, $content)
                 return new Font($document, $header, $content);
 
             default:
-                return new self($document, $header, $content);
+                return new self($document, $header, $content, $config);
         }
     }
 
diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php
index f37a14b7..425cba67 100644
--- a/src/Smalot/PdfParser/Parser.php
+++ b/src/Smalot/PdfParser/Parser.php
@@ -46,6 +46,14 @@
  */
 class Parser
 {
+    /**
+     * Configuration array.
+     */
+    protected $cfg = [
+        // if `true` ignore spacing between letters (= fix random spaces inside words)
+        'ignore_letter_spacing' => false,
+    ];
+
     /**
      * @var PDFObject[]
      */
@@ -55,6 +63,7 @@ class Parser
 
     public function __construct($cfg = [])
     {
+        $this->cfg = array_merge($this->cfg, $cfg);
         $this->rawDataParser = new RawDataParser($cfg);
     }
 
@@ -104,6 +113,7 @@ public function parseContent($content)
 
         // Create destination object.
         $document = new Document();
+        // TODO set config here
         $this->objects = [];
 
         foreach ($data as $id => $structure) {
@@ -205,7 +215,7 @@ protected function parseObject($id, $structure, $document)
                             $sub_content = substr($content, $position, (int) $next_position - (int) $position);
 
                             $sub_header = Header::parse($sub_content, $document);
-                            $object = PDFObject::factory($document, $sub_header, '');
+                            $object = PDFObject::factory($document, $sub_header, '', $this->cfg);
                             $this->objects[$id] = $object;
                         }
 
@@ -229,7 +239,7 @@ protected function parseObject($id, $structure, $document)
         }
 
         if (!isset($this->objects[$id])) {
-            $this->objects[$id] = PDFObject::factory($document, $header, $content);
+            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->cfg);
         }
     }
 
diff --git a/tests/Integration/ParserTest.php b/tests/Integration/ParserTest.php
index 0adc3de3..b8bb10d1 100644
--- a/tests/Integration/ParserTest.php
+++ b/tests/Integration/ParserTest.php
@@ -146,6 +146,28 @@ public function testIssue19()
         $this->assertArrayHasKey('17_0', $objects);
     }
 
+    /**
+     * Covers text whose letters are placed "too wide" apart, which results in random spaces inside words.
+     * The first case asserts the default behaviour, the second a config that skips the space handling.
+     *
+     * @see https://github.com/smalot/pdfparser/issues/72
+     * @see https://github.com/smalot/pdfparser/issues/314
+     */
+    public function testIssue72()
+    {
+        $filename = $this->rootDir.'/samples/bugs/Issue72.pdf';
+        $document1 = $this->fixture->parseFile($filename);
+
+        $secondParser = new Parser(['ignore_letter_spacing' => true]);
+        $document2 = $secondParser->parseFile($filename);
+
+        $expected1 = '1Der Z we it e W e l t kr i eg';
+        $expected2 = '1Der Zweite Weltkrieg';
+
+        $this->assertStringContainsString($expected1, $document1->getText());
+        $this->assertStringContainsString($expected2, $document2->getText());
+    }
+
     /**
      * Properly decode ANSI encodings without producing scrambled UTF-8 characters
      *