Skip to content

Commit

Permalink
revert 4f4fd10 preserving fix for #260, fixing #319, #322 and #334 (#342)
Browse files Browse the repository at this point in the history

* Revert "Fix \f crush"

This reverts commit 4f4fd10.

* revert 4f4fd10 preserving fix for #260, fixing #319, #322 and #334

* reduced sample PDF and added more descriptive comment to the two new test cases
  • Loading branch information
Connum authored Sep 30, 2020
1 parent 7a0dbf8 commit 1862686
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 1 deletion.
Binary file added samples/bugs/Issue322.pdf
Binary file not shown.
Binary file added samples/bugs/Issue334.pdf
Binary file not shown.
6 changes: 5 additions & 1 deletion src/Smalot/PdfParser/Font.php
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,11 @@ public function decodeText($commands)
}

// replace escaped chars
$text = stripcslashes($text);
$text = str_replace(
['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
$text
);

// add content to result string
if (isset($words[$word_position])) {
Expand Down
35 changes: 35 additions & 0 deletions tests/Integration/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,39 @@ public function testIssue267()
$this->assertEquals(Image::class, \get_class($document->getObjectById('128_0')));
$this->assertStringContainsString('4 von 4', $document->getText());
}

/**
* Test that the PDF from the related issue can now be parsed:
* too many backslashes were being stripped, which resulted
* in malformed encoding of parts of the text content.
*
* @see https://github.com/smalot/pdfparser/issues/322
*/
public function testIssue322()
{
    // Sample PDF kept in the repository as the reproduction case for issue #322.
    $pdfPath = $this->rootDir.'/samples/bugs/Issue322.pdf';

    // Parse the sample and pull out its text content in one go.
    $extractedText = $this->fixture->parseFile($pdfPath)->getText();

    // The sentence must be present verbatim, typographic apostrophes included.
    $this->assertStringContainsString(
        'this text isn’t working properly, I’ve edited it in Google Documents',
        $extractedText
    );
}

/**
* Test that the PDF from the related issue can now be parsed:
* too many backslashes were being stripped, which resulted
* in malformed encoding of parts of the text content.
*
* License of the content taken from https://stackoverflow.com in the sample PDF:
* CC BY-SA 2.5 https://creativecommons.org/licenses/by-sa/2.5/
*
* @see https://github.com/smalot/pdfparser/issues/334
*/
public function testIssue334()
{
    // Sample PDF kept in the repository as the reproduction case for issue #334.
    $pdfPath = $this->rootDir.'/samples/bugs/Issue334.pdf';

    // Parse the sample and pull out its text content in one go.
    $extractedText = $this->fixture->parseFile($pdfPath)->getText();

    // The known phrase from the sample must appear verbatim in the extracted text.
    $this->assertStringContainsString(
        'This question already has an answer here',
        $extractedText
    );
}
}

0 comments on commit 1862686

Please sign in to comment.