From 521e67679ddd1bb4fbe3ff190c9279740a07ccc7 Mon Sep 17 00:00:00 2001 From: panique Date: Sat, 21 Nov 2020 02:10:46 +0100 Subject: [PATCH] fix for random spaces problem (issue #72).patch --- .gitignore | 1 + samples/bugs/Issue72.pdf | Bin 0 -> 8461 bytes src/Smalot/PdfParser/PDFObject.php | 42 ++++++++++++++++++++++------- src/Smalot/PdfParser/Parser.php | 14 ++++++++-- tests/Integration/ParserTest.php | 22 +++++++++++++++ 5 files changed, 67 insertions(+), 12 deletions(-) create mode 100644 samples/bugs/Issue72.pdf diff --git a/.gitignore b/.gitignore index c685f194..56badf4b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ debug* composer.lock /.php_cs.cache +/.phpunit.result.cache diff --git a/samples/bugs/Issue72.pdf b/samples/bugs/Issue72.pdf new file mode 100644 index 0000000000000000000000000000000000000000..05ccfdc1574ad44024b172ba2f2f1071e5b5def9 GIT binary patch literal 8461 zcmai32UJtrwzUBg=|~X}qV$@CkV2QeCp+H21}_dX+Ma_cH83qwTU0PddWJ*_?UJvjhT zAQP0%@Yq&R7>9lwi>ZDZ6@LQNAE$4?8SM5#`|Jh?0>3V6eU@J5K;EGbOQ2 zJsd)#^i!&LB6LZ!T~joK#`~22o6Kd?2U(@IHyR&RdYQhmvI8|f&%$65o4VC)@JZp} z+!AK6xZc-EHO>Wda4G8%6;oGP=!&UpTX=N%ca9gl zq8j{}@$yR)L2h(EQ64o|&&1cm!+x#WV+O8*^+VC-1-t%Q*qJH+_-J zw8y>;%;fPj?V|APwt$?ob4wgfC)LZ|xPc#`kG8LM3-ORL z83)y=p7*WPdsj?>y;~ZQhhm1EJ?Ai9kZ)ts@1Y0%zXtgLyYW$<)R2@ac-V9iH(~>>yWF-fCQRcna zaj88`hg4ktwx3+%p^|+2xqNIy!ioqRLqm{aDTO^ZOlX5*s$i(eG9DYA(A5!^rcUYb>&`Gi!jsZ)`YCuWFX+yTI)S8 zFnDC4;ZACY|Eiz#oMgSf-?7BZ$JUR@Q{@(Lo0b=-B^YbQdl+v0tK0iBYooTYw!8r& zbc2~NcW+@{_l119N_vuTbF*IEC;{U^xXtt&p2~l6a^U*s>L1NBR1Ki#19oNwQRtMG z(%7U~J0q$$>jHt{ObH4l@`;?OY61ajC^@?FU8m`{mW`Dpc#<{6_7#S=OsphZ#EiI#2rEap=eQslY#$rLJC|zaJ-2YCLc~9BxE$i*^ zpf}tl$8Mz-*~b@0u(PWR=IZ3`R~72MQ+M7naxw7ne-=Yo6z?ky`-~ZEe=%Zs_;z;r zu|<7zQQ^Cl7lvd7tS9G1eVjH~RWeJq_T(m6V(^8};C>66Xul5?UoV#EZX`L_<&D^s zwz%cj4%7pee(}L|&H2QW@g)LJA~~I}GT&6pnq_#sBuXKkL(*$NZx}(d^!6==?czhI z_^_ESb<>Obxu6h!Ba=4i#m#3AImk0KQq(oJMHI8MkfsKJO$+1X=!}NZu@cj zYevlM-2hu`T7|6Ux}t9fwtbVDy!7?P^iX&Ke7(l#x+ry_quhPP+@h(0`igv((zS}CG9M$i_0ZhY&N%AL>#gAk=K<-5LpV1#68QaXY@w-zr^~lj%io&t zV;%fE8Y-UD!?#|tQg3A#Z+kDyckUTWFtKk^O9?(3VcaDxGNA))w`zhYwHcUGXs_Lb zhMUc3sh&^um#I9b>01#;kCeGf8Zr)8alPE*94_@P!~bQBr2Z{noXD`=N`arE;OlW7 z@kjBlw)_S$`|2V3#c2`tUE+@(?hEO)%dJk!hZ{HvDX)sTZmcXrk>Xax$UV9wuH0o# zDJkza-#h0!`JruDStr_Ytsc59$^FIC`{rwpIm`597#?axgnCqmC>G?(8jiN53@p#9 zhcLf&OORYQ(n5ps8$<)$v(dI#&N@DI)%7I9I~_4)7b~a@M-!7%?=Og>q=y9xcx9dEMpyxpbfeILWUF)A!iU*! z6Gz?eI-cc|SxO#++zTD*h|ndKj#2T7vcKaKpp@t1Hbmb|ykimqJkc(tv9^V!avM!Mxq4oMM z{0F*ER-I>_YfafzzAEgae2-KgX|qr9JdNb}u+qsusxoxyV)Lwh! z0+-|pdnD>guf7l0tFkEXJ$p83FUFL9+I6%4S-1!}13CZxQKT&d0fSXnxY(ahtruI# z_p7wF1J4W;%bBW`4Ak2vZR@@(>>GyN6JM3aUe>sR_~i{{xPea=2^3sk7f@`st*=R<|`j zK(+A`Gj*YH%ksWxu+|C(K0r|Iw>4HvBOs%)8BJ$dHRv5EO{zgzgu zJV&bLW-F*`-sljqOL2f*dS40gs`1Vu5C6RXmRYT9Qgm;^A{_OZm+{Mur ztuM;!+EJs?T0>ILF`Fj`He0VxwdjV~?=P90cAUK7MAFq>abVhF@+y$=vu?F`V0~A& zG0B!}0qXX}xMb@*{RYeZiig1;JgPnna8*}Y6A= z^Yu8O_?{1d4I|G)4F|jnDJDgwy7ZbT%2Ah5xDW$XZ1wUOd~B>LXY1xnv!ylqbo2z? z?N4i-a{Hp^^>;7*#MBqN$6kF*ZZ=nXQEm{^o>_|YK}>y_f4wyD^XZBS&-a+1P2pT^ ze_qv*7iA&>{16fP-WuPm4~kC}az`DjFHYYkkqc~PoAmV%R?!i^g0U-g$=I^2SKGFd z*Xf#`eOErc?Bd(#d3$Ey!;eTwg|LP>cF)rf!xf&>6RYim+3XDMrVYouvTGfDc|-ZOg9@Dn3Xpw1rKyq_fulRh<(B^#H(dKwYcLs747hdsgI;~-`E6jsiGZ#UH|!`a(UMMO@z=OMVG zith9C^Nn#g`TmjVudfllQ@={rvR>UAFAZi&`KtacO4vbc?pwKq#}XYwBNsPh9`A(|}b6(?C?#G$BWZasTdOs_S_MtU`2%*D*qwg4r3lsJ}L z6-iot|29tIaec~)4b!)A!$x?a(BgU1@35xoGj|&Uq^Y=3o%NI zgI-QIY${y1BElfTc}K6xw}Gj30KoWJP)a>TXN@^We<1b-2=0v!(qR1ZDUtiGtB~{! zA+t3N5~eK9HQ?Ht_iax2&S&2{yT97jCB&r|Jxc5u8cx~wZ7gJeS~)sThkBoUjoar` zz~6}c`YqS{^TCf;GsufDMOJhw?Wf=8+KsE-FRD`q`7EZ|WFOr#DMqb2#0RaiPMW1nLoE$!;xaoWW9MZ~7NJo3KA5SBXcWv;$C`c2_M+)0HA;AQ z`F18L)5x8%{H_SyP)`=iaPrxLE}lkpWXrZqW+2{=QY6x;S~pC?wy7~isnVfCMpl?q zj9G4KP0TcyZcd$&(Jrh)FU*gNe%w~tvOuz|i?nzmYr>YZwe)8_DN`rQ>3disO~u$R z`LNjVoUgx%yLhHew+$%aJ(LmLES3}q#`)qd?yC0f$sdO?mIO|`hbg;uAzH0p z#lbA#-L`pCwVhSiY`SJ#oYLodM~M{8sM;OJy!PS^-)iLmMEVziv-{_bX#8>Zp{^K6 z@IYyNCt+@MG`iY5(839s@j!9hH|uWGtIvnoE)qSD`;H?ck5@Ei2P2lQi3>T8<5jUz zneA%YaYB@e+A1nb=^kQQuW<70#apGRr+5#sgp{7>2eTc|rX@w`IO2R4y~VO9le|S) z^L4#4Rr{d{hv2;@FKp7f^051;e6svqKfDJ#Lye~uPjQs@`gkeYvCvrNYAkjCvI71g zTeQ<+KDj;B+)dDALNse(6Bpf^FvnfnmroYi)l{I>ID5mPDt`AFUpV7Xjas})?b@L? z&f^7_%gAEYsKpP=;_UHG%UPoeyPL~6!*j*B{=_#=r|&8!x&p)!A599!+~rDto#>`drE*iuXTaDOXP-G4 zJ(ih8Eh3evO$UN84kf1SXa)&A)$-iMQC*iXbt>_WqyUmV;It0isXlfLoEZQ5HRSe# z9zH9xW^@Ff;;lkKOd8tS( zcCKd@O-H9ot(3X9GsQwaQ?4}HTjx$bqkY`tm>abp`JiRGf+8rBt|;h_Ad0~zP(VCu;}(oM_SBA??A^aE=f3jRbo(FXv&BEmzJVN z;(R|!k*A5T(k2qA7pK%{$ovF$GyhVv8mxw>Uu|Lzx6Is{u0Vmh;S)Fzi(VB^z!?sITn#p_Zvw~HsfU2=j1l`B($Azk$bJ-&MyN#W2T1lRq0Yb zFVZM=pY$QwR*oKedrqOqwJ;Bkwa-jtYYJJX))PO;P`T-eW|<2waGqr2@6j(Stn7R) zZJlU%yLWZPY}h0)v(ZdMQga`2JeyZu>ocG}|Ne1(BGr!HwQ(n@p^bs6r0gi130J@z zosl5!5eUV-fZ#MQ8n8qtc_$9r%f$P&!aaEO*!jb{?`}{h^f4b~#S8bfaK3eCopj6k z3{4w0QGCiDE78mEH`4VpwEVF6$+*s#_9#Xv_J~4cQ_DCB!tUjiuyH9KUIXsGTc19W zw!$yF1Zn4-d#Rg4JMVS$P`RMCCc;1kOjrn!Ky3cUrGYlm6u{&1e$i?7efW z3VJ1~4f)|xDqCH(&cj%~aHPIJ%0AI^5Su(DD5rG4hXZFXBylxFe6b*3(A;---NO?8rB+xsZx(2%m{{k9a7pd&yWpz?n3&L!A9 zoKZLYF)N2-l9sG1ivMzb|J%s+M))|ryvy@2oM1ahDj}} zm6wM&Vmnq?mLnJ=R*RurJz3=$ezCg0s>^oR#Me{?-PP>TbaV1uNlE{Zeg#UBv>^3E z3-4vK-6#ZVw5G8v3e=hNQKyiOcv57kLw=^B5D(~&%m3W?wQ;ra@O97kiYQ4zD{7gl z8Q9Ox2(n8*mijOD!U^9aVx_Ahd>i+Yk*k4DPcK&HyPAF3%o|iN0a>XL5Q%L+c-$B( zLj8k}wYLkDyC$<51Us0!y^!#8|HZKOy&u!gy>D9L=Y*v&+u!7FEZKZj=W(-)9@1$P z$QU4dsI?dzGFz2nSx;?Lxf4Ok@E%BR)zQ#Z*MKQ^H#1lZ&gLoOl#BgBXVGg7;hz`G zW_U!>a+*?Mg?)DnIwa|}art2uXw{Z&z4LaqjBcairn{uRB#-}tk3sHMR#~i8Pqa;n ziuOJaAtV+y#VBqsF72=0>GPlB6=<(x&si)j+qIhf%=S2g?IQdoh3QkFix>2BpW1dh zq}N_%9DMef>dMbGvT!x>?>N$Ut%HNf32Y8zXk*j(%E|juoN4){gFouM^`y6HL`d6v zvO?NlLb!v`{7-Go7M|a(6z7(P&AEl2i#VTj$+R;2Nhj-D#~RpOTP?~idTdliG|SD+ z;pG=qolzMSPA>5S-rQw&7XsMUtb1^Zwt=cMKcSuIhh|A8_A}Ssv6&1lAa&b+b(}xi zfxr7-V-By`MNp*h+>X21}H{o5Ip7Oy>J%7Ffk}l7z&00A)*KbP#g)d0;yr` zJX{^*(9RwxAQ%9W!#JSOSRhgy0ss*^Bi;y$Lf`<9f}OW2%GKEgON;}646rCqV<3_c zLQGH4k5HmOqC%n%Am|%v5w&q^^ZcOQb)F9)t6}hgE-7K{3tRX? z%0b_Xm-`1Xx5WX2iJvy6KJ-jy*bzRownof)i)J%`?uXgK$B*4VmW)i@njLN218S1p?8dhVR~Ds|WN4Upmqp#wm;1j+ z-{e>lp9_cy9LvlvltbUlNdP5iu$;+o%Ur~qMMn+xHqlT9k)lFfT`qR~Lh;6mm@iE5 zMpyhV#1a1w!2jk5$iUAYOC*e;uOI5SRo)JRBF@tPO%+#P3|7I#&X-65O}jta5GVj- z;_8TX!B`*&bb*1vK!m6$P>et#C=?15g@J)^$R9Q$oqkh_@QipR`Vvp8Kj`rv+5(BJ z)(3zLy^PSVM4b^x2+>&|q=j;HwIePZVh1pB zmm!CI5S~Va*Z->Ts6O%gxfD{Y} zOX;?&14>^-9;61u5~f8)2BbhJozOQ1NYrHY|FUR^#Sk|GLZW{ciUItiI>3L*BLw`e zk602Bat&8Uj0N!b7>KtDUVcQB{?Y~HFFNq=J{0T-+klrd0S{-w@PESr0O@$6&~gsM z9mV30Tp&|3b0FH!!{d)1Fd?_59om@>g%&n4;Qt%g{}NCMgHRx-$b=0`N?A!+84QLI z;S&Xe;o^j=7~x8w2|`o`K%6~CKL^ym{KORg@PQM2fBO^s#0c?l_@5^s9u6g(1Utb6 zOq>=`CBhX<$o0ERLfwi4Ih#*WLX_YEfe=m@p@kF#LwLbpUjDz}f%8Q<0l+|r7=V!J z&jSdD!Jsgp6Yx(PLJUfvD)9iK|78P%Aq0Bd#F@BeNCLkZdc%LXT) z^G};N0sX(n!VyTqE937r1O)nzS`bj!KVlJ3Bw=#?D;5F%XFeq1J@Icptgjtmi}xjx z(!ezkMSu#V@8v~c)o=C!Kx$|wFCc_C>x9LvrU false, + ]; + + /** + * PDFObject constructor + * + * @param Document $document + * @param Header|null $header + * @param null $content + * @param array $config + */ + public function __construct(Document $document, Header $header = null, $content = null, $config = []) { $this->document = $document; $this->header = null !== $header ? $header : new Header(); $this->content = $content; + + if (!empty($config)) { + $this->cfg = $config; + } } public function init() @@ -282,14 +298,18 @@ public function getText(Page $page = null) if (((float) $x <= 0) || (false !== $current_position_td['y'] && (float) $y < (float) ($current_position_td['y'])) ) { - // vertical offset - $text .= "\n"; + if (!$this->cfg['ignore_letter_spacing']) { + // vertical offset + $text .= "\n"; + } } elseif (false !== $current_position_td['x'] && (float) $x > (float) ( $current_position_td['x'] ) ) { - // horizontal offset - $text .= ' '; + if (!$this->cfg['ignore_letter_spacing']) { + // horizontal offset + $text .= ' '; + } } $current_position_td = ['x' => $x, 'y' => $y]; break; @@ -302,7 +322,9 @@ public function getText(Page $page = null) if ((float) $y < 0) { $text .= "\n"; } elseif ((float) $x <= 0) { - $text .= ' '; + if (!$this->cfg['ignore_letter_spacing']) { + $text .= ' '; + } } break; @@ -724,7 +746,7 @@ public function getCommandsText($text_part, &$offset = 0) * * @return PDFObject */ - public static function factory(Document $document, Header $header, $content) + public static function factory(Document $document, Header $header, $content, $config = []) { switch ($header->get('Type')->getContent()) { case 'XObject': @@ -758,7 +780,7 @@ public static function factory(Document $document, Header $header, $content) return new Font($document, $header, $content); default: - return new self($document, $header, $content); + return new self($document, $header, $content, $config); } } diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php index f37a14b7..425cba67 100644 --- a/src/Smalot/PdfParser/Parser.php +++ b/src/Smalot/PdfParser/Parser.php @@ -46,6 +46,14 @@ */ class Parser { + /** + * Configuration array. + */ + protected $cfg = [ + // if `true` ignore spacing between letters (= fix random spaces inside words) + 'ignore_letter_spacing' => false, + ]; + /** * @var PDFObject[] */ @@ -55,6 +63,7 @@ class Parser public function __construct($cfg = []) { + $this->cfg = array_merge($this->cfg, $cfg); $this->rawDataParser = new RawDataParser($cfg); } @@ -104,6 +113,7 @@ public function parseContent($content) // Create destination object. $document = new Document(); + // TODO hier config setzen $this->objects = []; foreach ($data as $id => $structure) { @@ -205,7 +215,7 @@ protected function parseObject($id, $structure, $document) $sub_content = substr($content, $position, (int) $next_position - (int) $position); $sub_header = Header::parse($sub_content, $document); - $object = PDFObject::factory($document, $sub_header, ''); + $object = PDFObject::factory($document, $sub_header, '', $this->cfg); $this->objects[$id] = $object; } @@ -229,7 +239,7 @@ protected function parseObject($id, $structure, $document) } if (!isset($this->objects[$id])) { - $this->objects[$id] = PDFObject::factory($document, $header, $content); + $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->cfg); } } diff --git a/tests/Integration/ParserTest.php b/tests/Integration/ParserTest.php index 0adc3de3..b8bb10d1 100644 --- a/tests/Integration/ParserTest.php +++ b/tests/Integration/ParserTest.php @@ -146,6 +146,28 @@ public function testIssue19() $this->assertArrayHasKey('17_0', $objects); } + /** + * Addresses the issue with text that is "too wide" between letters, resulting in random spaces everywhere in the text. + * First case is result with default behaviour, second case is with config that should skip the space-handling. + * + * @see https://github.com/smalot/pdfparser/issues/72 + * @see https://github.com/smalot/pdfparser/issues/314 + */ + public function testIssue72() + { + $filename = $this->rootDir.'/samples/bugs/Issue72.pdf'; + $document1 = $this->fixture->parseFile($filename); + + $secondParser = new Parser(['ignore_letter_spacing' => true]); + $document2 = $secondParser->parseFile($filename); + + $expected1 = '1Der Z we it e W e l t kr i eg'; + $expected2 = '1Der Zweite Weltkrieg'; + + $this->assertStringContainsString($expected1, $document1->getText()); + $this->assertStringContainsString($expected2, $document2->getText()); + } + /** * Properly decode ANSI encodings without producing scrambled UTF-8 characters *