Skip to content

Commit

Permalink
mostly refinements and some code refactoring (see below)
Browse files Browse the repository at this point in the history
- Moved some test-related code to a separate function to improve code
readability
- renamed and refined testSpecialCharsEncodedAsHex: a test does not need an
if-clause when it requires certain values along the way; just
use assertX to check for the expected values ("fail early")
- ElementString: moved the part which handles escaped characters to a
separate function to improve code readability
- added references / comments
  • Loading branch information
k00ni committed Jun 5, 2024
1 parent 4ba5b8d commit e738b8e
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 41 deletions.
82 changes: 47 additions & 35 deletions src/Smalot/PdfParser/Element/ElementString.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,51 @@ public function equals($value): bool
return $value == $this->value;
}

/**
 * Handles the character(s) that follow a backslash while a string is being parsed.
 *
 * $name, $position and $processedName are passed by reference and updated in place:
 *  - an end-of-line marker (CR, LF or CRLF) is removed from $name without producing output,
 *  - n, t, r, b and f are decoded via stripcslashes() and appended to $processedName,
 *  - parentheses, backslash and space are appended to $processedName as they are,
 *  - anything else leaves $name and $position untouched and appends $char to $processedName.
 *
 * Further information in PDF specification (page 53):
 * https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
 *
 * @param string $name          input still to be processed; its first character is inspected
 *                              (presumably the one right after the backslash - verify against caller)
 * @param int    $position      counter that is advanced by the number of characters consumed here
 * @param string $processedName decoded output collected so far
 * @param string $char          character appended to the output when the sequence is not recognised
 */
private static function handleEscapedCharacters(string &$name, int &$position, string &$processedName, string $char): void
{
    $followingChar = substr($name, 0, 1);

    // end-of-line markers (CR, LF, CRLF) should be ignored
    if ("\r" === $followingChar || "\n" === $followingChar) {
        preg_match('/^\\r?\\n?/', $name, $matches);
        $consumed = \strlen($matches[0]);
        $name = substr($name, $consumed);
        $position += $consumed;

        return;
    }

    // process LF, CR, HT, BS, FF
    if (\in_array($followingChar, ['n', 't', 'r', 'b', 'f'], true)) {
        $processedName .= stripcslashes('\\'.$followingChar);
        $name = substr($name, 1);
        ++$position;

        return;
    }

    // decode escaped parentheses and backslash
    // TODO: the space should probably be removed - kept for compatibility
    if (\in_array($followingChar, ['(', ')', '\\', ' '], true)) {
        $processedName .= $followingChar;
        $name = substr($name, 1);
        ++$position;

        return;
    }

    // TODO: process octal encoding (but it is also processed later)
    // keep backslash in other cases
    $processedName .= $char;
}

/**
* @return bool|ElementString
*/
Expand Down Expand Up @@ -80,46 +125,13 @@ public static function parse(string $content, ?Document $document = null, int &$
$processedName .= $char;
--$delimiterCount;
break;
// escaped chars
case '\\':
$nextChar = substr($name, 0, 1);
switch ($nextChar) {
// end-of-line markers (CR, LF, CRLF) should be ignored
case "\r":
case "\n":
preg_match('/^\\r?\\n?/', $name, $matches);
$name = substr($name, \strlen($matches[0]));
$position += \strlen($matches[0]);
break;
// process LF, CR, HT, BS, FF
case 'n':
case 't':
case 'r':
case 'b':
case 'f':
$processedName .= stripcslashes('\\'.$nextChar);
$name = substr($name, 1);
++$position;
break;
// decode escaped parentheses and backslash
case '(':
case ')':
case '\\':
case ' ': // TODO: this should probably be removed - kept for compatibility
$processedName .= $nextChar;
$name = substr($name, 1);
++$position;
break;
// TODO: process octal encoding (but it is also processed later)
// keep backslash in other cases
default:
$processedName .= $char;
}
self::handleEscapedCharacters($name, $position, $processedName, $char);
break;
default:
$processedName .= $char;
}
} while (\strlen($name));
} while ('' !== $name);

$offset += strpos($content, '(') + 1 + $position;

Expand Down
9 changes: 9 additions & 0 deletions tests/PHPUnit/Integration/Element/ElementHexaTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,16 @@ public function testParse(): void
007000610 073007100750061002c0020> ');

$this->assertEquals('pasqua, primavera, resurrezione, festa cristiana, gesù, uova di cioccolata, coniglietti, pulcini, pasquale, campane, dina rebucci, uova di pasqua, ', $element);
}

/**
* Closing round bracket encoded in hexadecimal format breaks parsing - string is truncated.
*
* @see https://github.com/smalot/pdfparser/issues/715
*/
public function testIssue715(): void
{
$offset = 0;
$testString = '()\\';
$element = ElementHexa::parse('<'.bin2hex($testString).'>', null, $offset);
$this->assertEquals($testString, (string) $element);
Expand Down
6 changes: 6 additions & 0 deletions tests/PHPUnit/Integration/Element/ElementStringTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,13 @@ public function testParse(): void
$element = ElementString::parse('(Gutter\\ console\\ assembly)', null, $offset);
$this->assertEquals('Gutter console assembly', $element->getContent());
$this->assertEquals(27, $offset);
}

/**
 * Parsing the literal string "(())" must yield "()" as content,
 * i.e. the inner pair of parentheses is part of the value.
 *
 * @see https://github.com/smalot/pdfparser/issues/715
 */
public function testParseIssue715(): void
{
    $parsed = ElementString::parse('(())');
    $content = $parsed->getContent();

    $this->assertEquals('()', $content);
}
Expand Down
13 changes: 7 additions & 6 deletions tests/PHPUnit/Integration/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -441,19 +441,20 @@ public function testIgnoreEncryption(): void

/**
* Tests special chars encoded as hex.
*
* @see https://github.com/smalot/pdfparser/issues/715
*/
public function testSpecialCharsEncodedAsHex(): void
public function testIssue715SpecialCharsEncodedAsHex(): void
{
$filename = $this->rootDir.'/samples/bugs/Issue715.pdf';

$this->fixture = new Parser();
$document = $this->fixture->parseFile($filename);
$sigObject = $document->getObjectsByType('Sig');
$result = null;
if (isset($sigObject['4_0'])) {
$result = (string) $sigObject['4_0']->getHeader()->get('Contents');
}
$this->assertEquals('()\\', $result);

$this->assertTrue(isset($sigObject['4_0']));
$this->assertEquals('()\\', (string) $sigObject['4_0']->getHeader()->get('Contents'));

$details = $document->getDetails();
$this->assertEquals('x(y)', $details['Producer'] ?? null);
$this->assertEquals('a(b)', $details['Creator'] ?? null);
Expand Down

0 comments on commit e738b8e

Please sign in to comment.