Skip to content

Commit

Permalink
Fix issue #387 (#401)
Browse files Browse the repository at this point in the history
  • Loading branch information
smalot authored Mar 25, 2021
1 parent 7d3c67b commit 74a074f
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 3 deletions.
19 changes: 17 additions & 2 deletions src/Smalot/PdfParser/PDFObject.php
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ public function getSectionsText($content)
$textCleaned = $this->cleanContent($content, '_');

// Extract text blocks.
if (preg_match_all('/\s+BT[\s|\(|\[]+(.*?)\s*ET/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) {
foreach ($matches[1] as $part) {
if (preg_match_all('/(\sQ)?\s+BT[\s|\(|\[]+(.*?)\s*ET(\sq)?/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) {
foreach ($matches[2] as $pos => $part) {
$text = $part[0];
if ('' === $text) {
continue;
Expand All @@ -224,6 +224,10 @@ public function getSectionsText($content)
// Removes BDC and EMC markup.
$section = preg_replace('/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s', '${3}', $section.' ');

// Keep the graphics-state operators found around the text block: prepend "Q" when one precedes BT, append "q" when one follows ET.
// @see: https://github.com/smalot/pdfparser/issues/387
$section = trim((!empty($matches[1][$pos][0]) ? "Q\n" : '').$section).(!empty($matches[3][$pos][0]) ? "\nq" : '');

$sections[] = $section;
}
}
Expand Down Expand Up @@ -270,6 +274,7 @@ public function getText(Page $page = null)
$text = '';
$sections = $this->getSectionsText($this->content);
$current_font = $this->getDefaultFont($page);
$clipped_font = $current_font;

$current_position_td = ['x' => false, 'y' => false];
$current_position_tm = ['x' => false, 'y' => false];
Expand Down Expand Up @@ -333,6 +338,16 @@ public function getText(Page $page = null)
}
break;

case 'Q':
// Q (restore graphics state): go back to the font remembered at the last 'q'.
$current_font = $clipped_font;
break;

case 'q':
// q (save graphics state): remember the current font so a later 'Q' can restore it.
$clipped_font = $current_font;
break;

case "'":
case 'Tj':
$command[self::COMMAND] = [$command];
Expand Down
5 changes: 5 additions & 0 deletions src/Smalot/PdfParser/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ public function extractDecodedRawData($extractedRawData = null)
$extractedRawData = $this->extractRawData();
}
$currentFont = null;
$clippedFont = null;
foreach ($extractedRawData as &$command) {
if ('Tj' == $command['o'] || 'TJ' == $command['o']) {
$data = $command['c'];
Expand Down Expand Up @@ -382,6 +383,10 @@ public function extractDecodedRawData($extractedRawData = null)
$fontId = explode(' ', $command['c'])[0];
$currentFont = $this->getFont($fontId);
continue;
} elseif ('Q' == $command['o']) {
$currentFont = $clippedFont;
} elseif ('q' == $command['o']) {
$clippedFont = $currentFont;
}
}

Expand Down
3 changes: 2 additions & 1 deletion tests/Integration/PDFObjectTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,8 @@ public function testGetSectionText()
EMC
(ABC) Tj
[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD ', '/TT1 1.5 Tf (BT )Tj '],
[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD', '/TT1 1.5 Tf (BT )Tj
q'],
$sections
);
}
Expand Down

0 comments on commit 74a074f

Please sign in to comment.