From 268a620b96523eb4244c42931885024c8db8dae1 Mon Sep 17 00:00:00 2001 From: Alastair Irvine Date: Fri, 1 Dec 2023 19:13:56 +0800 Subject: [PATCH] Ignore encryption (#653) * Add ability to ignore PDF encryption check * Switch to ! syntax * Update src/Smalot/PdfParser/Parser.php * Additional changes for #488 doc/Usage.md: - Moved description of `setIgnoreEncryption` option to doc/CustomConfig.md - Added brief "PDF encryption" section doc/CustomConfig.md: added `setIgnoreEncryption` option and section to describe it. src/Smalot/PdfParser/Config.php: Doc comment for Config::setIgnoreEncryption() Added tests/PHPUnit/Integration/EncryptionTest.php Added samples/not_really_encrypted.pdf (thanks to @parijke who originally created this as test.pdf) See https://github.com/smalot/pdfparser/pull/653 * src/Smalot/PdfParser/Config.php: PHP-CS-Fixer issue fixed * Update CustomConfig.md refined texts * Config.php: use explicit PHP doc entities * ParserTest.php: moved tests * removed EncryptionTest.php --------- Co-authored-by: Jordan Hall Co-authored-by: Konrad Abicht --- doc/CustomConfig.md | 15 ++++++++++ doc/Usage.md | 11 +++++++ samples/not_really_encrypted.pdf | Bin 0 -> 6695 bytes src/Smalot/PdfParser/Config.php | 21 ++++++++++++++ src/Smalot/PdfParser/Parser.php | 2 +- tests/PHPUnit/Integration/ParserTest.php | 35 +++++++++++++++++++++++ 6 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 samples/not_really_encrypted.pdf diff --git a/doc/CustomConfig.md b/doc/CustomConfig.md index 34d5c1cf..377e6102 100644 --- a/doc/CustomConfig.md +++ b/doc/CustomConfig.md @@ -21,6 +21,7 @@ The `Config` class has the following options: |--------------------------|---------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------| | `setDecodeMemoryLimit` | Integer | `0` | If parsing fails because of memory exhaustion, you can set a lower memory limit for decoding 
operations. | | `setFontSpaceLimit` | Integer | `-50` | Changing font space limit can be helpful when `Parser::getText()` returns a text with too many spaces. | +| `setIgnoreEncryption` | Boolean | `false` | Read PDFs that are not encrypted but have the encryption flag set. This is a temporary workaround, don't rely on it. | | `setHorizontalOffset` | String | ` ` | When words are broken up or when the structure of a table is not preserved, you may get better results when adapting `setHorizontalOffset`. | | `setPdfWhitespaces` | String | `\0\t\n\f\r ` | | | `setPdfWhitespacesRegex` | String | `[\0\t\n\f\r ]` | | @@ -63,3 +64,17 @@ $config->setFontSpaceLimit(-60); $parser = new \Smalot\PdfParser\Parser([], $config); $pdf = $parser->parseFile('document.pdf'); ``` + +## option setIgnoreEncryption + +In some cases PDF files may be internally marked as encrypted even though the content is not encrypted and can be read. +This can be caused by the PDF being created by a tool that does not properly set the encryption flag. +If you are sure that the PDF is not encrypted, you can ignore the encryption flag by setting the `ignoreEncryption` flag to `true` in a custom `Config` instance. + +```php +$config = new \Smalot\PdfParser\Config(); +$config->setIgnoreEncryption(true); + +$parser = new \Smalot\PdfParser\Parser([], $config); +$pdf = $parser->parseFile('document.pdf'); +``` diff --git a/doc/Usage.md b/doc/Usage.md index 864c2924..398c243c 100644 --- a/doc/Usage.md +++ b/doc/Usage.md @@ -230,3 +230,14 @@ foreach ($pages as $page) { ]; } ``` + +## PDF encryption + +This library cannot currently read encrypted PDF files, i.e. those with +a read password. Attempting to do so produces this error: +``` +Exception: Secured pdf file are currently not supported. +``` + +See `setIgnoreEncryption` option in [CustomConfig.md](CustomConfig.md) +for how to override the check in specific cases. 
diff --git a/samples/not_really_encrypted.pdf b/samples/not_really_encrypted.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fe841fe87de7562a182e16e29b7fd899a11b7f6b GIT binary patch literal 6695 zcmai3c|4SD+m=K|mMmGaOqOJwea2XZ!N|U^p~wu`1~X%AvxSgdmO`Oy*|LNz$r8$A z35g0RMA;(QznR{ix1P86`@ZY1>pt)EysqOq&*QxA-|x63j5W37Ao56nL|cAaV_R)o z4gd-S1HI0>0hEQ z`kMy)*{d^vZys4@ECL#AJ%GnHTobJuodJ?L_snkI^0vOMn%vau?0@mK;gJdS(#Q3N znp%eXV`^Gm);yKzaZP?TjMuY0dr(X^tK378^&;m2oBX4|qo8$X9VNW30QW5w=1zZ@ z5l*PdA{am*;(u;Uozc(&Xdr152n0%d`@@F#<0#1g>>U3#mhm&nH`q3z>~{##vos{-DUkkz)a)33an~# zD9(+s&@A&*wx5-WTB$EaAY7ZUiqToz-*MF)5?%`mR|JM~=!ja%tAr)KEAI5MjKL2y zzY|Y!7S=vn{yxjn9)c1+h;E|HMvfh-3YRW`ejmF3n(W6m_&V}y+A(J2Ne2sS#;ZF%el@<|LKfrb!j{=ab{l8n90qi%4&SJiTPQ~j97A7847be5qDqkru^SH7 zzA_6|A)v+gHL2YZan1L)s&2g5vr`%;l-`yulDct%<@Rjr%QQPq3Ph9JF=h2Y;Aj2k zFHiPoU#`~-?uPoYCO-GZ(sAnc%v)?6Rt~p&v4^=(k>Q}O9VhB1RBN8@QS`05El}=? zWM4XDcQvXi>tlS+O!U_%d)e{*+ukY|p4B`<=4m52(S;Q^dCPIltzrjC+Orv}PZpR< z3=W+7`jkcUYo;P?b%*_Q`n;agiw6?)4ZV;1aMPX2d%;%-RaG*hJGVABgZ}I4TZXI; zHS=f(^O?bqpJNSoq54Ivo1d;CpZNQ|E8mROS7m;O&8k%J^>EOLb$gr-`D!0Jur}=I zg|CJRat3t-{Plon9|`H#Yha=XwSI|92 zvn2{1Bd;yrW37l-d)bfKk}W8`H!f`{xNa1!(oyNK3g%y&ldwmLM2F@wyT%6hsWvYy zN#wnqBKcK4Of=H#P)O+Yja>Y4k*5S$6cytWd+N+NB1x7&#fy7TZ+kF#mHD=XpE2VcEb;)H&-a;k%7?*7A5aBuZu9HZvQvg2;+(z1&a?|A z*dVJU{mrw%`7d_yIxqTLauR1P41`TOf0YO=^ySBHMKm`>o5%6(=ktK|eD{+qLzHawFndkL^eB#k*L9x9YhE5mdmb*S zVRvPH=3Ru0C{vtoCH=XWamm#KIHfbJ7kp|U6RX?P(ZS!7b@aENKW8}86M#IwVojfm7_gleLu?wI#U6#W5wU}A==kNLR@f-}sAz6V@})sqWkF4D(QK#PTNWy0u5_>V#1CA)!N{v~{FDmQ z6ygmY;gJQ8MfxlUDC)b#$`El&J5NdHwM>n<|4mAV^Md z1}`nOdn2M~{t+J-9*L+f6dAi+z|OXEKi@MaK0qg-^4?fWAcUjTuvMZ zTj=r}*JlaOM`C2857)<`;oXY%5uHWnk<5wgL~>?o=Z*16^0gQDRCcn>4OKM)5+F5Y$Y%n|fT|gWLZ@>Pe0QM*{@Px7j%3Wc> zR~lGO;h7+ZtwJ#s)jehtoyS{7VVNVJl2%5QrP1EoZPAzotpP#`mm4r^ zYYryHOCMM;uOcB5U(VGsp7J0LV@3_C6X~MpEb8!uUv4`N9rLL+AJ%{0?5SIdaSo%i+8e0B-3T-3riJ4Fu>~ zmI>E}wjrN6p}tQZoj-Qp94i<9a-yNGL@D~Mm-MUh-D$3a(fS{HqFgEZ9zLo?t#96! 
zo~W*RvJiOa=6uA935Mdh2X!?11=Xo%2Fj(Q6o zOYx!Zw0Gu&=i-0830YGZ+(SH69aV`o79Sq};4dvydal@2ww%H{cclGFoY`9_ze0`N ztIa~x(LlbQ%ewq(dC`#@aq=hBzp$yCE6QE+sx;z&XN#Qd@~mk4TFH?1#)xTn;@I58 z>phX#_ZjFk#-093?@9^`dm~l zG*ZV@R32~tF+Cc(mAGD%LqcK?j#dcIJdBTxQBr|3cP(sW5%xZp}@Z7wKhYM?DjlC{#hUk2jD#~A!fhMV<0qa_T+V_JKaExpT)hD zT#cJ=oFc|%z8wg@jp6Q#j9l%Uu-+pb9;jc}*?%naBG>egqbF}TGb1`n}BgtArE0gNF%iOW4G=>R#mZ7-fZm$PX#lPCB!PTz?%BpZ|-`=+1nu<$^*(< z8BB*URoyBpA2=1@bn4||D;`nvk@J^{HEOE8G3@sB=9LWw+M!&>#SkVzok2;`+cTOQ z(bL<<>6mge^k(<)nu4nA#~osAa!>2x88zZ{aXQ5soIA6r1Lr?`FrUs~>^!3Of@BTv z?cL`c0mElyl-5?`v?SxwYumNAbKj|4EeP^`?bW_j z*VS3+8D`WI{w%kY$%rnZgUMvXqi&y5Kr5IdWjG-~TvJR#U-Rm5`9n2uj+La_rOg}p zO}=K|dKFLKt3QtDW4k6YaAWLQOI4Aqs*rDjd!uE8K-Gkk8IOH;@kaE zNd3ge9k-BYq1cE0*FG_ju87uAI&8bef*LK4VcCq}ewAsh}|KY5v2-)LwLL?OF(-)Yf(@{Y9pj3Fug6d(=cg z&p`nDiQ{Lz`R?auFvJ*e!tmkm`Al~Tb+M%zRZ)0IC3Oyq1 za;1Y|Xqa=Q`uW`WcB8)FO93lkd)!H!a@DYVso~fc);`${WM}*#r5RurlWLJwbb-m7 z1*Z>j`aF;9hTCE0tKFR~59wbmN$J|Ri28=or#p>MdJikG@yCJjxK_uF{<&#GnNF>_ z-nXo}?^~Lcb4-o>?*eZ&Jbq>V-~mh$WsMwuXE(gWSzMu8JTjXebxn0Qf_(YXpwpBr z2PQJ#>fKnp{$kewo~hSuQOYq#0ZoxieBSi1=j&sWVO(*W@+LZ}Ty(em=|r7)+}?q{ zO@72rnC3ksyA+)(ZmWq$sB+xDOS0?6JTf*pwaOXyv`)fq(>}5PWgDQheNrj+4&1)0 z;T0Zwr=T*Hi%XE_Ns6nzQ2Fazof|@lOOz*hq5`khJPo^E9Vv{Wlx46ym2mOv!e?Eb z-ob7TN^?!JcO2Q1j}h}a^0Lq1y0Y$egonyUY(>a&szAcT?f%`Jl~0Z1<`teRqvxgj zj>Z9v(&Y-pMIt<3BGbhVw9oG1xVf0ZoP&-LZpPbMUDwPlYRK#W#}=`F$SKPZW#MOv zVDNgV`s(S`Bt#_#xJo%Ja%6L^>}>NzU&*iSZb8Z01{VqvBR`!#zl4V~h#P;M;``KH zEmd*4z#xaAu1ppWt24~mvMONzG?{y1>XE_M>Bz|M4HFpdSopeMdB@)M{Fh?z!N&w2FBO80QdluPNol{;Vg-{dH}CnLy1wY7^`BOZh$y@%)epe zUWfDpV97#bnlNXQ;QPx34B# zj&*6#zF2Qg{`TEDFA$~5`}A5&K$Ti?R*JYvZTH5BR(IjPvKs?42s~)*b*xv|Cgx0{ z<8X*O2+rW0`?Mhe&AP+d%9>2fbE~XgP$47$-@bWlo+9oS9@J!<;BW6g8GQPLMTa4y z%0)9U>ZEy0yOrkfSLIxkMvpciQWE&Q^8{W(ipBOWTFg||A74D0{E5P3Q@K~h0oUwS zn>bpFKJrcewnmcO7tMt;teZAjV)H!=PsUztAT7lg(Z%chxUyXS^=PrCE!q!&tL@e zC4{x&4yM$H=JiYpYijQ|#FF@7z3KHncDg@(H{m*#%*g zQAb3$%r%_Z-(BRa<-Zu@Ig4Z;S#a5=8bqw32mb 
zehjSEK=kO;#TL0#k0!)Z)KuI;5MPwdRze2uJNHgz9KE*sU&%+puACGUF_A%68zkK$ z_9#j5%0kHRU-uZkeSkcJtR z_I|ET1k=;%ARQounsE#Uq(OyF-J1-gX~CwyahwK**0$7z{tc82_yZl_7d&df4}3I8 zw3O?);>otaA7`LFYIyn5vh>>)kl*y@AA4vxQXAIGg_;i+>hXW10{}AeCJ?bswBEP< z*ILv@qvn)ad!oxJM9hVKZ>fJI_-}QX3S5J#SW~YPWhhPyj)vi|2nZI8!+{|R5G)Ff zgG05T5DmIeCk9~6RyAdnCZ7)}cUL!;m@7!0RL%{N*d z3xUDlXe1gA#iFQd{n(O}kF|KbPz!w-&vQk(Igv2Yap zk63C&|F9vT@IPV^a43}$|6C6NM^G>I|FEG@f5s{x|A<9`VZZrNNRHHR1c}xrX0Aa5 zYWhH?US3q5|L7+G2uE}x1$tATf3jJJ=a ly%4d~Z=@yyjD_NGw3Z=L97&WPWk*6#)c-;fI3q2<{{WrNSCjw% literal 0 HcmV?d00001 diff --git a/src/Smalot/PdfParser/Config.php b/src/Smalot/PdfParser/Config.php index ff69d3e6..e44b1640 100644 --- a/src/Smalot/PdfParser/Config.php +++ b/src/Smalot/PdfParser/Config.php @@ -82,6 +82,13 @@ class Config */ private $dataTmFontInfoHasToBeIncluded = false; + /** + * Whether to attempt to read PDFs even if they are marked as encrypted. + * + * @var bool + */ + private $ignoreEncryption = false; + public function getFontSpaceLimit() { return $this->fontSpaceLimit; @@ -151,4 +158,18 @@ public function setDataTmFontInfoHasToBeIncluded(bool $dataTmFontInfoHasToBeIncl { $this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded; } + + public function getIgnoreEncryption(): bool + { + return $this->ignoreEncryption; + } + + /** + * @deprecated this is a temporary workaround, don't rely on it + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function setIgnoreEncryption(bool $ignoreEncryption): void + { + $this->ignoreEncryption = $ignoreEncryption; + } } diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php index d3cac625..86bfe555 100644 --- a/src/Smalot/PdfParser/Parser.php +++ b/src/Smalot/PdfParser/Parser.php @@ -102,7 +102,7 @@ public function parseContent(string $content): Document // Create structure from raw data. 
list($xref, $data) = $this->rawDataParser->parseData($content); - if (isset($xref['trailer']['encrypt'])) { + if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) { throw new \Exception('Secured pdf file are currently not supported.'); } diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index 29091914..fa0d3f42 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -403,6 +403,41 @@ public function testRetainImageContentImpact(): void $this->assertLessThan($baselineMemory * 1.05, $usedMemory, 'Memory is '.$usedMemory); $this->assertTrue('' !== $document->getText()); } + + /** + * Tests handling of encrypted PDF. + * + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function testNoIgnoreEncryption(): void + { + $filename = $this->rootDir.'/samples/not_really_encrypted.pdf'; + $threw = false; + try { + (new Parser([]))->parseFile($filename); + } catch (\Exception $e) { + // we expect an exception to be thrown if an encrypted PDF is encountered. + $threw = true; + } + $this->assertTrue($threw); + } + + /** + * Tests behavior if encryption is ignored. + * + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function testIgnoreEncryption(): void + { + $config = new Config(); + $config->setIgnoreEncryption(true); + + $filename = $this->rootDir.'/samples/not_really_encrypted.pdf'; + + $this->assertTrue((new Parser([], $config))->parseFile($filename) instanceof Document); + + // without the configuration option set, an exception would be thrown. + } } class ParserSub extends Parser