From 5248914c76084672d9f00a94e3f4d538e6470573 Mon Sep 17 00:00:00 2001 From: Romain Deltour Date: Fri, 14 Oct 2022 10:38:58 +0200 Subject: [PATCH] feat: improve detection of non-UTF-8 file names - explicitly read ZIP archives as UTF-8 encoded - report a (new) fatal error PKG-027 when a ZIP archive could not be read due to encoding issues (detected by matching the exception message) - add tests Fixes #1236 --- .../epubcheck/messages/DefaultSeverities.java | 1 + .../adobe/epubcheck/messages/MessageId.java | 1 + .../com/adobe/epubcheck/ocf/OCFChecker.java | 139 ++++++++++-------- .../adobe/epubcheck/ocf/OCFZipResources.java | 3 +- .../messages/MessageBundle.properties | 3 +- .../files/ocf-filename-not-utf8-error.epub | Bin 0 -> 1510 bytes .../04-ocf/files/ocf-filename-utf8-valid.epub | Bin 0 -> 1528 bytes .../files/ocf-filepath-not-utf8-error.epub | Bin 0 -> 2040 bytes .../04-ocf/files/ocf-filepath-utf8-valid.epub | Bin 0 -> 1785 bytes src/test/resources/epub3/04-ocf/ocf.feature | 22 +++ 10 files changed, 104 insertions(+), 65 deletions(-) create mode 100644 src/test/resources/epub3/04-ocf/files/ocf-filename-not-utf8-error.epub create mode 100644 src/test/resources/epub3/04-ocf/files/ocf-filename-utf8-valid.epub create mode 100755 src/test/resources/epub3/04-ocf/files/ocf-filepath-not-utf8-error.epub create mode 100644 src/test/resources/epub3/04-ocf/files/ocf-filepath-utf8-valid.epub diff --git a/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java b/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java index deedf5ddc..cd25d4799 100644 --- a/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java +++ b/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java @@ -307,6 +307,7 @@ private void initialize() severities.put(MessageId.PKG_024, Severity.INFO); severities.put(MessageId.PKG_025, Severity.ERROR); severities.put(MessageId.PKG_026, Severity.ERROR); + severities.put(MessageId.PKG_027, Severity.FATAL); // Resources 
severities.put(MessageId.RSC_001, Severity.ERROR); diff --git a/src/main/java/com/adobe/epubcheck/messages/MessageId.java b/src/main/java/com/adobe/epubcheck/messages/MessageId.java index 340db9aaf..dd8486823 100644 --- a/src/main/java/com/adobe/epubcheck/messages/MessageId.java +++ b/src/main/java/com/adobe/epubcheck/messages/MessageId.java @@ -301,6 +301,7 @@ public enum MessageId implements Comparable PKG_024("PKG-024"), PKG_025("PKG-025"), PKG_026("PKG-026"), + PKG_027("PKG-027"), // Messages relating to resources RSC_001("RSC-001"), diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java b/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java index d8cdde80e..04960f4b4 100755 --- a/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java +++ b/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java @@ -85,7 +85,11 @@ public void check() // Check the OCF Container file structure // -------------------------------------- // - checkContainerStructure(state); + if (!checkContainerStructure(state)) + { + return; + } + ; OCFContainer container = state.getContainer(); // @@ -270,83 +274,92 @@ private boolean checkContainerFile(OCFCheckerState state) return true; } - private void checkContainerStructure(OCFCheckerState state) + private boolean checkContainerStructure(OCFCheckerState state) { - // Get a container - Iterable resourcesProvider; try { // FIXME 2022 build resourcesProvider depending on MIME type - resourcesProvider = new OCFZipResources(context.url); - } catch (IOException e) - { - // FIXME 2022 see how to propagate fatal IOError - report.message(MessageId.PKG_008, EPUBLocation.of(context), e.getLocalizedMessage()); - return; - } - // Map to store the container resource files - Map resources = new HashMap<>(); - // List to store the container resource directories - List directories = new LinkedList<>(); - - // Loop through the entries - OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build()); - for (OCFResource resource : 
resourcesProvider) - { - Preconditions.checkNotNull(resource.getPath()); - Preconditions.checkNotNull(resource.getProperties()); + // Get a container + Iterable resourcesProvider = new OCFZipResources(context.url); + // Map to store the container resource files + Map resources = new HashMap<>(); + // List to store the container resource directories + List directories = new LinkedList<>(); + + // Loop through the entries + OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build()); + // FIXME catch IAE MALFORMED entries + for (OCFResource resource : resourcesProvider) + { + Preconditions.checkNotNull(resource.getPath()); + Preconditions.checkNotNull(resource.getProperties()); - // FIXME 2022 report symbolic links and continue + // FIXME 2022 report symbolic links and continue - // Check duplicate entries - if (resources.containsKey(resource.getPath().toLowerCase(Locale.ROOT))) - { - context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath()); - } - // Check duplicate entries after NFC normalization - else if (resources.containsKey( - Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC))) - { - context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath()); - } + // Check duplicate entries + if (resources.containsKey(resource.getPath().toLowerCase(Locale.ROOT))) + { + context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath()); + } + // Check duplicate entries after NFC normalization + else if (resources.containsKey( + Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC))) + { + context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath()); + } - // Store the resource in the data structure - if (resource.isDirectory()) - { - // the container resource is a directory, - // store it for later checking of empty directories - directories.add(resource.getPath()); - } - else - 
{ - // Check file name requirements - filenameChecker.checkCompatiblyEscaped(resource.getPath()); - - // report entry metadata - reportFeatures(resource.getProperties()); - // the container resource is a file, - // add the resource to the container model - resources.put(resource.getPath().toLowerCase(Locale.ROOT), resource); - state.addResource(resource); + // Store the resource in the data structure + if (resource.isDirectory()) + { + // the container resource is a directory, + // store it for later checking of empty directories + directories.add(resource.getPath()); + } + else + { + // Check file name requirements + filenameChecker.checkCompatiblyEscaped(resource.getPath()); + + // report entry metadata + reportFeatures(resource.getProperties()); + // the container resource is a file, + // add the resource to the container model + resources.put(resource.getPath().toLowerCase(Locale.ROOT), resource); + state.addResource(resource); + } } - } - // Report empty directories - for (String directory : directories) - { - boolean hasContents = false; - for (OCFResource resource : resources.values()) + // Report empty directories + for (String directory : directories) { - if (resource.getPath().startsWith(directory)) + boolean hasContents = false; + for (OCFResource resource : resources.values()) + { + if (resource.getPath().startsWith(directory)) + { + hasContents = true; + break; + } + } + if (!hasContents) { - hasContents = true; - break; + report.message(MessageId.PKG_014, EPUBLocation.of(context), directory); } } - if (!hasContents) + return true; + } catch (Exception e) + { + switch (e.getMessage()) { - report.message(MessageId.PKG_014, EPUBLocation.of(context), directory); + case "invalid CEN header (bad entry name)": // reported by OpenJDK + case "MALFORMED": // reported by Oracle JDK 1.8 + report.message(MessageId.PKG_027, EPUBLocation.of(context), e.getLocalizedMessage()); + break; + default: + report.message(MessageId.PKG_008, EPUBLocation.of(context), 
e.getLocalizedMessage()); + break; } + return false; } } diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFZipResources.java b/src/main/java/com/adobe/epubcheck/ocf/OCFZipResources.java index f31f6c26b..8a719fb04 100644 --- a/src/main/java/com/adobe/epubcheck/ocf/OCFZipResources.java +++ b/src/main/java/com/adobe/epubcheck/ocf/OCFZipResources.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.io.InputStream; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.util.Enumeration; import java.util.Iterator; @@ -32,7 +33,7 @@ public OCFZipResources(URL url) throws IOException { new IllegalArgumentException("Not a file URL: " + url); } - this.zip = new ZipFile(file); + this.zip = new ZipFile(file, StandardCharsets.UTF_8); } @Override diff --git a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties index 1f6abd970..a09ca0378 100644 --- a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties +++ b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties @@ -316,7 +316,8 @@ PKG_023=Validating the EPUB against version 2.0, default validation profile will PKG_024=Uncommon EPUB file extension. PKG_024_SUG=For maximum compatibility, use ".epub". PKG_025=Publication resource must not be located in the META-INF directory -PKG_026=Obfuscated resource must be a Font Core Media Type (was declared as "%1$s" in "%2$s"). +PKG_026=Obfuscated resource must be a Font Core Media Type (was declared as "%1$s" in "%2$s"). +PKG_027=Could not extract EPUB ZIP content, probably due to file names not encoded in UTF-8. #Resources RSC_001=File "%1$s" could not be found. 
diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-not-utf8-error.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-not-utf8-error.epub new file mode 100644 index 0000000000000000000000000000000000000000..dab8ad697678d08ee7dd590405fd6f6e1d83f2b6 GIT binary patch literal 1510 zcmWIWW@h1HVBlb2P^!Hdke|3(M}&ca0fd1<47r)PsU?*Ksfh&zIho0cC7Jno`l$t_ zN!nGJ1p!c<91MR_o(1S%UdOT;sN)9^ivh8(Ylx$+r=OdCa(-S(VrE`ykzPe^PHf*n zu0sX_F6X~#ABtP*mM=7mt7)F?3-&cH!^EonH=_JolH}Qfx6wvWvd)hA&(8V(bSO<4)ZbeNz3Y``kA*7_z-L=uh8r z!Eo89UO#u~7t5XgTWI~gc>O3d$a|1r0C`W7TXE$Yptqg?F-X)kAk+!$wbZ^?0dZG$SMB!oiJROMP!Sy)ZsnpRalBW!$@c)h920 zeHaw`-nUmp;;fJr_pF|6>y8K|XmEVIx4drlw`BKCHs?>87+17$oy`qTN;LcxuM-)l zSs`8G+RM;4Z&JUXL+AYux84|Za%x;?nmaYLEjz?~yRB-s_^&758|;346E2*&G5Ef8 zspJ9~C=eoa6JIKE10)x)!AieUX^Z7Qcel2xQ`>^qq7X^3z zEBlK7I&v@Us_hR0^-UX3$+BGU+q~Z^k4t9Dt3_fKSwU-Lub5;D{qgvFmbu+T(K+n2 z_~SDxbgDzuTkdp~?f6yrez*6cxmFv$E^E_swtR_f z?cbW|KSMv=N(|lV93Z6l=p=t7qmz%vMN6Z36YSoJBSJ$O7#fW)Hr!1IhD1LT1A_o8 zGzt=vvlG)(_3{hSdV}`*9X1fSSIPA{Y|0Hrm%u43oQy)u_Rf(J6GY>032#gmn!c_7 zNb!fH7@ygzq=VRwfBzd76qBE$y>Em3ielB2l}wH6-tp{o& z|6noOAhG45fNklqz`iYVKFQIlPS}nElP$ z3DI$i$vh$HTC-JO8J*O7=JoXPsB>SLJo&^eNA8`!_pXW$@MdHZVZdEB0b>RUz)C8FF6`wJLW>oU ziM^D9n1MUvA`BcH91Lzr-l0InaKHy-xdwzf>E|Vu=~ZNu-zhJ^5)-@v$wlH|E9y@ zsnRj6bKPI39b{j0fkEeVkY4%H`FtBzzm__uec1TQi-J4_C5CQw4iHj& zbdtZ4(aFc-qNUNi33l(q1H9QeX1h=Q_z38CPeuj?Q0Rcdm4iVV7&?tFHr!1IiuN-x zFbKdxrywyoJ25>~FTWtIH)yZlVFQ7Cm0YjGrrcn337o>h$tc8Z?;II1K{WoB@Wy1J z>D&5`6n{vH@tM6!I*9G~_rGyLG5I;#`!>k0C{|5b$<(;+9nVfz&h?*vKjB)pdh(&y zznt|03~uqN>NtPcIpy_NE7ixdx4ahrSE_Eym}zlB=d8-*77d@d>|DBBMyf}IL;9bb zcolYM@1A=ce1Ruyv-~BQ*X{^t)#F?K{qCO$`mc(PEPvj9o5Qw8se(tZ=6>ay!`oPb z+26dK5FMwO%oCEXHCy$S(Mi2$UQZuS-YZ(Pq%e2qd)svm)>#D{fzz07WyF^2ZFsml zNRngYn~j!t{?;s?ow`Anqr0s{Xq%k(g?rmWULTn6aO|Vzdyyv_*IbUDEHnTA`pxgX zEdH!K5s`c^?0dZG$SMB!oiJROMP!Sy)ZsnpRalB zW!$@c)h920eHaw`-nUmp;;fJr_pF|6>y8K|XmEVIx4drlw`BKCHs?>87+17$oy`qT zN;LcxuM-)lSs`8G+RM;4Z&JUXL+AYux84|Za%x;?nmaYLEjz?~yRB-s_^&758|;34 z6E2*&G5Ef8sEC$5B 
zt|5-Po_=onATK6n=A{4 z(8c}RyH-@lR2b*4y1JJ8l~?VjLtbC-Ux-08oC*58ZQk1_{%GcwsT;4Z9y zenbMWq6?x6QBWagQcyvK09HUIYO#f|5qma9=#2noZm?dYOpUA?TY5w2ehVxXz`D^> gAG#6faf&eF3^T464)A7W1F2&LLUW)qSF?b40KsuX^EonH=_JolH}Qfx6wvWvd)hA&(8V(bSO<4)ZbeNz3Y``kA*7_z-L=uh8r z!Eo89UO#u~7t5XgTWI~gc>O3d$a|1r@bmWz&0^@tT?_OUiv$A$ACPtp2zAoea|w zC~RA0?`J0In@7^NZJ3%Hrx$hXZ2QNP$7H)ww`~wr+Vt?j$z z&QkMA;tdQ8^(rz-a&v6^4f&f4L|W2+if-L)a>Ui+rGn9njFbxpH+n7gz2)@6_-K8; z=CPM?@Ag%ny!iEDQ0#l(UKNS6LRQ?fdbX`QB9x%P@$ugBy4l~7-8b2sKWSoI(Z+Q) zH#{lP@K?M}WT0k+bct&(L*Kkf{eBLe_dncvW6;T|aiMAM)X=u<5cBP}s@>who_uex z`}Iw@aOTF~`_ie_;+J%)m{;$(c7mg%_kN2US=;KOr8&!Kf`~!}bBV_-#mHSEER~!17izfK z9RH^L)$&);(v0XfW6=u^ANTG5z2sS$xy8q};3XHg2TbK+sWOiGv|4Q5{CC~@1-EN6 zn#}a8T9p>o<|Lk#$j$or2@co7HrrWTxu&C*XuLBgq1UC|S;;MO>Qo;SMXBs8wK9zp+a~y5u${2n z(fZ@t=bz_#C`wg2&DAJg;2X8Vd4Bup--UJOrk4gEIWB*Hf7*-6DF=%S1eX~;IPzO; zP2!xk2>z9aS~gyYUHNKR!2Ir4mnR2({S&fZ%JTB1Tc5Ukk7@jNCHaYtcaQTnv&GL- zenhY2YKic_5r1duZc~1~BMs9^4*qV?eIXztdG@9B37da2yjUw&N>{!&h&yrY!}os0 zw@(toW=ZI`N|x(9zg->`w_jK3<)8mE{f-JAYwkKTSI5=FaKiDUI+7~;zl%1USfjjU z;_>>ezu93i#=y`T;1>#vBsky&vcP$>xS%OHGciXWp>!@Va^dQL3~_X&#U+(Fsl|H9 z#l->Mj7%a7xXXKFV-Tf3LKpV39-+$$$i!aqL(ITkphC<5CQSxOU>Zd&UXk@;E3-g) zftrBe6HG5g35IS4dbt474Kd>q&=Fw!;iUw!UTirDq4y3mR4-D#Lg>aGt03Kw9NEW; o5yznD0tFfa1GZQOOMr=;=vtBEoRtlvh64z{0KNDQsFZ;L00PPE0ssI2 literal 0 HcmV?d00001 diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filepath-utf8-valid.epub b/src/test/resources/epub3/04-ocf/files/ocf-filepath-utf8-valid.epub new file mode 100644 index 0000000000000000000000000000000000000000..2117b3f149206b7148c07c777e0c6529edb60f1d GIT binary patch literal 1785 zcmWIWW@h1HVBlb2P^!Hdke|3(M}&ca0fd1<47r)PsU?*Ksfh&zIho0cC7Jno`l$t_ zN!nGJ1pz>vA`BcH91L?Cd_#eX;Xne&at#P|(l0JJyfQg6F-O0+q%tS9STDJ_n3*BK zn;m2{0~1s|$Q+OhI2ihX=B%r`&C|ljz;FjBEdVp8ATc>RF+Ei;zaXtQXm8$a1A%+L zMUJd(bP5cWr+-#12Q~qlCD`{y)bepm01&5FO_Wxe; ztjyfvV_Wc&i`xUHa3@$K`^b3GKLDxKzP6ff|NTH!pu z{q*m`x^vS@gO41SzrR22Mdg%(#RY=P3?Cf%Ew(0cPFn>3%0n$1FT}2VwJczM_p8g3 
zgTDR=*)L^z`O>XVTfWCMe!G(VM8~_wd7Ihd=P5s;S8}yP_}_@XGj+EqKi`ptX(b1L zH|V|)kdZw5Qu>6=zZqVvl`Ewy-y6i8IQHRtzv9~`iD9!O^jjs%b)MfYkBZx`tMu~E z|CxSA1&=j%otdlSYGOFy_)#56mHppE8&0fI-ZJrc{np>?h#32m@+?6A@;a8)z?k|0 z#9~0~>l)&y>*?pFpPZjpl9-v7TBKKzn-kl2kn50vfXn%B+K1wny5$Sa;%b^_`+|MV z%dqB`c^N?$_iyi7Q6W=doWJVoTJBd~wVw`&iEXyjNK)^8KDqU9U9_I#+EOS{P4Tfy*4f@l!Trgbrsn^e4`o(gm{}x(*FJ3>&jOxAc zuZ{`#fZp-~CQ3da4Nlm3iDh~f86~+np%V=A4=V_?rauoC`1Wf~Nk;D6$Xk8B)+H}P z1f|(kcuwp)a&o_Z+lrv&?e}WS-}BwsSGT98wtkb2><` zf9ZU`4Xa;EtKa&K{4kWv-oeA@cGhvl!8t72!K7b@>|<#yz1IkB8^g1W+yjwy4EAM?F@UdMXR40J`9R|@7t>)aaPERdsff3bw`8}G&nxqTV6N& zTeABmoAW14j4Rr>&gOuyxq4WNSTW<_HIW;ac&7B(B zmK|cg-Bz_*{MVE34R*i22^Y@X7<^wk)mr?LP8IX&9oJ59l$2a5SQzZp77?Gt*fV<{ z%zq3RMHoD$;YC=0HzSih1MZ>{7$8UhR&YXep%#;{f)7+oB7hZ;iL1~=*p0oAM3~Y9 zGX<&0MAnTh!y0G)