From c22368ff30d522aad55f30b77deafe93d8bdce03 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Sun, 18 Apr 2021 23:37:22 +0200 Subject: [PATCH] Support search with or without diacritics - get the original index using a dichotomic (binary) search instead of a linear one; - remove diacritics from text using NFD decomposition and Unicode regex; - convert the query string into a RegExp; - replace whitespaces in the query with \s+; - remove pdf_find_utils.js. --- l10n/en-US/viewer.properties | 1 + test/pdfs/.gitignore | 1 + test/pdfs/french_diacritics.pdf | Bin 0 -> 10500 bytes test/unit/pdf_find_controller_spec.js | 125 +++++++++ web/app.js | 3 + web/firefoxcom.js | 2 + web/pdf_find_bar.js | 6 + web/pdf_find_controller.js | 356 ++++++++++++++------------ web/ui_utils.js | 4 +- web/viewer.html | 7 +- web/viewer.js | 1 + 11 files changed, 333 insertions(+), 173 deletions(-) create mode 100644 test/pdfs/french_diacritics.pdf diff --git a/l10n/en-US/viewer.properties b/l10n/en-US/viewer.properties index 5fe094b769b286..e02857c1157980 100644 --- a/l10n/en-US/viewer.properties +++ b/l10n/en-US/viewer.properties @@ -168,6 +168,7 @@ find_next.title=Find the next occurrence of the phrase find_next_label=Next find_highlight=Highlight all find_match_case_label=Match case +find_match_diacritics_label=Match Diacritics find_entire_word_label=Whole words find_reached_top=Reached top of document, continued from bottom find_reached_bottom=Reached end of document, continued from top diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 1abd53fd2fc180..bc44edac2f7948 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -327,6 +327,7 @@ !issue4650.pdf !issue6721_reduced.pdf !issue3025.pdf +!french_diacritics.pdf !issue2099-1.pdf !issue3371.pdf !issue2956.pdf diff --git a/test/pdfs/french_diacritics.pdf b/test/pdfs/french_diacritics.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ba5b5cb07e2bde6e804ea005722ffa01b4f851c5 GIT binary patch literal 10500 
zcmbuFcT`jB(*LOeL+?Ra=shHaDj=a(siGhq0s_(r9RxxV=~5IBF;qd6CLq1{B1J$! zng}8YNRux7@Z59m@4oNJT9>olto)I^v$OUypUgA!o!P^!r>rUrgNOsUN5;E80^wjN z*v-ZXC@TvT(YABFi*f+N#3aD~dI3dL9i35jo?sDGXKR$5vYo9P(hhk2I?xN{X=m*M z^qZDTf|29FNC{3F+qrL>^>+INIX=Q*bcj&V=kj4PdNro;X)H2N9=&@zO~`#P#{Pw~ zJX6U$7u+Gi=;oi5?PO!+1>p~1>{++Z(Sh{sDC!1DqhCG}L{sbEI3_`dkH zZ-hOcB%mQ|-AIhiDV0{eD*$>+|UE zcqvR71?gs)hT1WLNBhTTp?$~4W|{Bf8^v`u%Rj1z64uhaDarh0E&jpuE+VL6DVE)m zmW+W`yL3uED$UJI_k_U#*>@g%^=PQAC8%qq?dUU8w9gnPc$iR+0YzT00+ZxgB;OYx ze+k~7QJ9fZ8op{Dps2546%ORMhOz}aJ;*fpUN2A-O9Dl?zRp^1jpk+C{ua1F=`u36 z5+>pBWj%`3iMR@0#du4YGGj(LK*)l&w##xOGc@e)oTadEmWE%NbMV+-oN)y^Y)S)- zowHkOdVS=CZvG2&Af`4B9k?JvCL~4Qggfpx1VpRw(Ow(DIX=ji0}8^SUN7#C{rBJmr*3 z7vaWn$Izwb>miLF?Pkt6_XccQP2{8qj<$2fcepTCs}2GQy8jCfxb*;W*-gwiB>lIQK@ zP^n+`7hni(y&;8Kh(JQ6)LSr%_oKoV z4-pmlxc+JQJ2I)m>U!y8x?`$qK1tn@54)C%%rK|wa{$)tr7rynS_9_yPefEaI5tIt zx@Gg4#7yC}!L3G-t7*M(?cr70_iKuc&5g6R_4TrJsCxPjsAWJ&HO>O1AjU{w?#|QB zJCYib%TjFTY(IXOZ_9hqWs1Fjyiw)75;U`2;(I@;GSO`%(4jL*g91VHPT>;D&^btkPwftxGL)IHcWj7Wm=fJT(?(UtzH$?j<8AV;y^nXy zcp*0QHsrd0Cgxd+NnmsUuOwqXZdKvTor8u&lSrmhTq|hQ`QHh{pjZ$~OJ5ps^`jQe zP6XwH5Ox)-EBVz@gN>m)(#h!@lv29TNK9HJG5QWr07xIUp0}*OL(E@h>^eL9&^_$l zjYkwdJ}bkLb{@^mm{j1k0RaI+0k43*&If#+UYQ2f*-FE1Ht%DWvSZA+K72Lf+8_%K zzTXmq^a|9-P_Nlb(m&>#7RGhpi3lw_FvhKz9K*2X2iaWv2WmGyz9Bq0$*McT)Chc-Fjd)6Xe4{Px>XLJ?N{#HI&r6lh9iu!KXXf((!? 
zb4l!l;F@kO7z;7c|0}V!-?gG|_}YrReX}oINT^6nL8a~K z^*d|@i6ePL?1e{y=>{=M?P|#xTwE`@`s!_l1m|yb-Q}KV*wgMltYD4mXrJYrcsx$V zq_oyQoLu?g^mu0<@p8`0UE$^DPw1Zs&K>P)0*Xt|!@L&osPiF%cJsa8lENxQMq)xD zNk+Sg=ktHCos}~q&A|_o5QCML2#Rkioov}0$Dss{t#ypu)$@ZDu9X?t?$91~*QQRG zR16>I9A`iySRDb}VPzZnB2y2luc|6w+qj*_frS_r`6DWJd3BIBI^={0qr_P-p*I8N zTgXMfqp3A{3kyY5%xVOflF^ zfOErrnuSzGAg^?7s$>+LzY~y~y`_!^dcTJJg zH-a?`r$9(RFYYD5=0`t}8dy0Ym+y1U{@dMK2kz+sm|LpmuI4IQ4gvYBY80Fzd=ZLV zsJNieM?ZRl-IG5Hp5f8g=fbc66vN_ju*=8Cv{5Z!JCqV8+r)CA#_x3q3^m)>Kf^Lk)$8w!<9ynfVpbKZci%jlH}wg2sgC%%t9CR>VaznGWKlQ^jbKfGrCbm31KC1* z`%Le1)#R=Y0LlxMxW|8^aDUKUm>B?^N(CDt`L{YoIo`R0L^w5o(CQ!BZy;|o2KD)K z``<$H+Qx;C=_O;+Et5PL6CJxxabY1n?*Y8*kS+X*{-QTnP>KH?6-@l{z6O^VgTrq@ z^>A$Q2}zV9X_ZP9A)Ih9+aRF^*NUgKzvt=$K~RRrY9NkE-JV=9oo0-tVcp&{yezuV zAcjZ>CBkrHh%N&mfE54$NP}yL7LNgds9_=yi0!(S)%C5gu%;Yz*z-03pD+>E5hDkK+%Tw`2kI)i^?JDk@`Y(-pGXXZ+zIK8k3hZKGm(%Pyw z4;6y-y7k^)_WZd7T*AWg$k7 z`fIP3S&6K7ElGbhCRl<{15JB#ax66ac@#I-KlrS$olFs0HVt=kSiR8{{xsax9Uwg* zeLvxTX`7(D?-9jTVSC^yK=W(P78d^4N{fFW1U8akz;*McLmWhTkyyyS6CW_+d5vfT{iYJKFOH`kcU#4*JQbAONV^W4VJ|@p&d|&9Yrx zm=vyD*9ehZzQi|V#DiZRWY<^^lInxAMxRS_1>fv%F3Qciry)u6wf#XO6V+9!No9vv zy=RehKZ6;Sz8Iv0RDQH!DlnJ}V91V`d)`txFhf1n)kWmCB#@#LzeztD3=_;-q-61! 
z<-W6YBkfs!Ix5}KnuemPQdCzKJ(P-*(PMhwDyfwq+nOsR6bmtIg@O3x_u$4@o6`j` zz=8>f&|q6*SO$Yb3ST!f9@@x(94+VnyA)G2z0)h~FUHoT);k(r`)>VX?A!gB$gM~d z`+aDLL;x!pe-5tPPkttY>@i3e#h?L%CzWE)KCo5e;dXK*?U&mSTeEko-**EOw~^b5 zX3xfA0w;}iuHb0npAo1{WRI5{1*YAhb(%2zG-9h$GwQ=)#q@;;g*(?pj?-3_=(R~) zEh7&kB_PK0|3U3@yUlLgi>4=i|)WqAYew7Bws^h7sJtW34 zla{vG_`rpKqhuQxtb?~A7P|SeBys8LUGC=3WW)ZURDUNaicSBEN?v;JZzRk7_Pr=I z1A;=RTCW)*h0mWBy@0RH?qJ7`#yq|d2hUiAFI?|2{o?Tt9!Ip16T@Q)dmsDQABl@T z%rPS7j^12AZXG1k)}uTe70cw@H&eS{J2!Uy);R*Z{juUK-6t%8;Zu5gEPd2)K^-;4 zrKDiqtg~5vkbUWAVXx;c#`^;Y%RxIzk0ISw*^~bDa-+ z_=k^3$aEv>u^_{y+r=*Ld@k=4G`oJQ&{SOfH&6yjdkDsnn~<<#D+p~)Je)@2Q2;UL zt9J>+OWgaW?-?P~!1o&dv(C=}kBuwkh&M-8dN~W9je%JcGJIIqDoHB+9Wopupj%e^ za&+FScXYm2{P_5>{G7byNSVZKGmPb~Qri*dA(I3R;K1=Fq8u?=3hfT$c zU%r##@=o_c#nbj|TSgoE#jX+pb|DxS?no+Nqz6F*Xkwb{l{NCN_Z& zzr5&58~)Q7TL^>(+szOcJNUh(0I}l25kUiGkRZdNra4gHe>!8zjv1eu?H}HlQMijSP zL&ScOS1YNGiVB;ACYEQxzaoX3pQ-B{vU+BXNfS^-zoRJ-$3X_2ibZZ!nkZ*z3@&Sj2gnBpb}P;oXStl z&C-AD5J<}wmfRkbA)bTK8*Q9;cY3t@qPMVcDbDMDs-#FA^wj=a%8ueoM){#L(=xW3 z;`tx481fkmcxM~84n7RatU4VHMxln7I;I|HX668JQgZNx$Xm&;jN`&`SLPq~_i_aYqx4J50{E zg6-9pH9I*}xn2Nb1>@rNZB*9V(^J@nnC^^H#Sa*eV?l;R-i!W&u;)Ee@w-!Wk#lQ^ zwLdq9Ci&ye3iT;C$7T6i0Tl#EDh#wpRAl-#@;35jIP7BP|z~B-(J_TKg$<* zrz9Mi>Pesf%1(bkYrz*H$V^{Zo9>o9?>AK$l2mCE5^lS{@jB>!P*bJloVk`&OL7mm zi{`Ch4vpe6jlM=deJv{zA1c$dnX2kB?NB|+hf)a{SYtYhZiaNcYTJvr3vMWI>!&tP z)=H0zKT>vf=VtAbOnD<(*%7M_cND8|^2$e&L2wU{EGl%{HXW?bvj%1GN;JPhB( zi_IAP+VNyX(qi11jZ5U?oK1#HFz@rHWw&c$PYR@J1pQITavn|8xmR)w3JhvA6F)E` zs)#&Dk})ov)3Pee6CltYjKdS;XGA}$ZBA-%$bR75<{H-tC&md8eu1TXlu_iHD z?HlaD*kR8I8r`we_O9{`+%*4%NpbPlfMiFDwZ`!k?Kv)ybZzhZ64|m%{kinMW&Z4& znT{SmQ%gG<-K7=)Sc4%Mny(jK5P|ASi?+0$D}&5>q{!wZzSxyLsnp6Sc-)swsPa2 zq>My4or`iWa^5L6t+16LdZ`Y_q--7h-in{#3WK_a3RQt7A3tJ*{cd87+BTJ0efgrc zbF`vDo5P;Q?m3TEA!H@Ou-PZ;t8Byavph*nxCT$quK85+SR>cTprh)({upwA_ss7+ zh+W2V?hWUKlKX;GT1I>8iEF=RQjT|2>x^H8wC7yg`!$T>jSi%^scLUJfoO~@-6o`Q zf53-9tW!~J5vnTRKM&V?<{_2Gz~mt{dqgvF3#8Jf6?(495K0MxJXag|l<|$aJ^kzXpdcvp 
zCKP&VZFSny8k8N@)D-rtE-Z07l(Yk%BnpiOh$SKwB?9?f1sQ{q+SmYrAREBiFMR48 zD!gm2&d!O57ys-Ar1z^o{=1zQt&G1vF%`Xh#|tho0vFT2uoKT(i_lQ(-wlp=oX-E+ z$oZcg4kYLh^hbw7a-0p(1NKOg9c|4H3p4ic_wWdgF9NUG5$#Pw^rs0knbB0iM9 zm`qt3lV2n=`+yMo*jI?yM7r^yZP51YXywN4wldAop6tY`vRB!si)VMcLOhw4r_Ii+ zNQ-Dwr>|YJ%feddK7@;yuf0qCHL zWPY;fp71_mA&M~SyWbuZE6+PWjU;nN#A4dG+pod^bh%@cY1%fq3DQ;BRPmqVsa+^i zN$byZKb+`{?(5E}PH#m^2<+NS4sxC6$S}U8OBm?&W}wKFtQ zaZwT$~TskZOPH$74+P5C3a*`8OuujRepZd76!?m_DD zRxPUD)26}mCQ49+(A)6e()sDGD4ZaA*AytruV;Y60Hc1o1*erP#Xvsq7vHXTl)Yv2 zLnb8mc+6#)Ti(=pJF0Z#GZ`8C;yLOK`sZ5Ka+6VL3N6DS0dm4y0Sgyw!W(*duNoTu zqi_Z+$md6EQ;ASs2ye+NCedJy>#eltTl6*szf0#xtAwxmo&*E5ZBiu?9Q!)k!h^9U zEMj>{_zAUCo>}EOesp6AHo|?O-rtQ(T#KG0XL`KSi08T?z_M!{VwSRYXsS3 zFdqxlsv{zW@%!z(=S-g@zpNr>c)Dk3UX=(=XV^w95t{Yk8l+TQ9qB472V#_s7Iuma zTOrmA<^8_H&mDsKeHRO|evLQb-}R##QQ$b=e)II(&AMpKs}AX7sjRv+s9Y?3uqbSp z)a4rsL&Ft7v>q*B!IC)wzC=Qh%}`X5YpoT=xI{TVPI+VEqux0Ex4a|1>*y~mTr&Bw zVl}8Jx5FR9pJzr`PFNe(cYD8p7v1}L6H93$sOx1{EfN@F3=L$+6%7O*Y82dSo@c#l zux;PQS{!ju`3>RNUT(eID*INe!oBmA`^#pRIiKk>3R?$^YTl!t)nbEJ3#0UXpUICN z8f+?;Wc~8k!pjCpw30qL?dcO7b<(7Yr*+5}dp!3f*MAoa3M|SQ_7D1daM%R=B5{=C z*w~T~!vE@wYk~h)L^Zic8vms=9)(XzI*1q~@7zI{g@lIvI>(JV2g{!0bWr#g9qM)e zWD7kzI>(Rtv;XatXw>s}Yc3@J|IZJ4o^D8Q+lv7jK6j-3O*`X@!5c9sL=?sk6j8dE zph3A^d~rn6lAE5^ z?(TNTi=iEBXD>UTh~7;VV~DD^v$Ku0tLyKz+;njC0$;rUwI(6(Ejv%Ii(7-?5SSQD zTv}8@Tv!w$@z+}aSRq*3(Z>#au_&mBvYm~iwJSgPp1U>5!OPAX?DOZ(B_ME!DF5&A zpg%`%{_}sm>F;g_7SXkFGDNumMRdSYzh{d)y-;9r(SHvfNl0EkZf+Q9xqd>B&VHwD zM2Kg$0HBrb2KXvZIn?n`)_ytNnjDx4pcd}frvne2$ zy+K(bDV{atr6HGTN=+!tf(FRAx6Iz~ zNrMk!iPWs-Q>n?mNL4A$;gS{&{)EmG?`hiT${ru2^~#$!%psBKRmMqoG6-Ca*3kaomO1k9CgLnupw=a)M*1_0K zr&x(jRxTt>+QJ5z{SUaP8GT>od8}`?whn>Zo7hK&GhVF7ra*TdF4h>!v+T?y^3~_v z*+Xccg1k$@!xmE}Z5+l_NvkGftoe1vk(e}#IQ47HWb2dqq_4(pz^jiGT+z^lMUrqc z6imxQX?GNBPXDrnI>U z{`|$Xjuk4W*~gSQoq2Ic&T}qqsO&5!y(X*lMGKE+Y6+I0@Shs%Ut7W0z|I~hqT!0P z^8=eg!D6E3KoLX70K1D%K#|*EQy5qj47)gI;O2$`!+(owgWs3iyMh0RFuxC!!KSj3 zii)bDib@KS(ozzVa8YTvC=3o4lY~pbpfFWsDaGsH|9;8EUH(1#cX=U87cWDOYYkEv 
zezrZ_X{!y`j$ZLoeo>BhG1p_jKMoWeDXG$Y?fo&-VZw$e$S^BwX6dWN!pdhP*k&-h z`(!Zeki#whl*kiKJEM1~ctlM1xX4erfGp-~QIOg;6Zd3^v;NN*C$^iM+KV9tPKK74XHpQTTJ#%Fzw)8qW~ob?c2(2N@Mcko^)YfqG)=LPBu9#9yN Ln_ER!75M)E&lq1~ literal 0 HcmV?d00001 diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index 1b97f47e48eda9..814e4c9ba2005d 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -271,5 +271,130 @@ describe("pdf_find_controller", function () { pageMatches: [[19, 48, 66]], pageMatchesLength: [[8, 8, 8]], }); + + await testSearch({ + eventBus, + pdfFindController, + parameters: { + query: "1/2", + caseSensitive: false, + entireWord: false, + phraseSearch: true, + findPrevious: false, + }, + matchesPerPage: [2], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[28, 57]], + pageMatchesLength: [[1, 1]], + }); + + await testSearch({ + eventBus, + pdfFindController, + parameters: { + query: "½", + caseSensitive: false, + entireWord: false, + phraseSearch: true, + findPrevious: false, + }, + matchesPerPage: [2], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[28, 57]], + pageMatchesLength: [[1, 1]], + }); + }); + + it("performs a normal search, where the text with diacritics is normalized", async function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "french_diacritics.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + parameters: { + query: "a", + caseSensitive: false, + entireWord: false, + phraseSearch: true, + findPrevious: false, + matchDiacritics: false, + }, + matchesPerPage: [6], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[0, 2, 4, 6, 8, 10]], + pageMatchesLength: [[1, 1, 1, 1, 1, 1]], + }); + + await testSearch({ + eventBus, + pdfFindController, + parameters: { + query: "u", + caseSensitive: false, + entireWord: false, + phraseSearch: true, + findPrevious: false, + 
matchDiacritics: false, + }, + matchesPerPage: [6], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[44, 46, 48, 50, 52, 54]], + pageMatchesLength: [[1, 1, 1, 1, 1, 1]], + }); + + await testSearch({ + eventBus, + pdfFindController, + parameters: { + query: "ë", + caseSensitive: false, + entireWord: false, + phraseSearch: true, + findPrevious: false, + matchDiacritics: true, + }, + matchesPerPage: [2], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[28, 30]], + pageMatchesLength: [[1, 1]], + }); + }); + + it("performs a search where one of the results contains an hyphen", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, + parameters: { + query: "optimiz", + caseSensitive: false, + entireWord: false, + phraseSearch: true, + findPrevious: false, + }, + matchesPerPage: [1, 4, 2, 3, 3, 0, 2, 9, 1, 0, 0, 6, 3, 4], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + }); }); }); diff --git a/web/app.js b/web/app.js index 213b8324a28b69..6903a8a1eb152c 100644 --- a/web/app.js +++ b/web/app.js @@ -2603,6 +2603,7 @@ function webViewerFind(evt) { entireWord: evt.entireWord, highlightAll: evt.highlightAll, findPrevious: evt.findPrevious, + matchDiacritics: evt.matchDiacritics, }); } @@ -2614,6 +2615,7 @@ function webViewerFindFromUrlHash(evt) { entireWord: false, highlightAll: true, findPrevious: false, + matchDiacritics: true, }); } @@ -2820,6 +2822,7 @@ function webViewerKeyDown(evt) { entireWord: findState.entireWord, highlightAll: findState.highlightAll, findPrevious: cmd === 5 || cmd === 12, + matchDiacritics: findState.matchDiacritics, }); } handled = true; diff --git a/web/firefoxcom.js b/web/firefoxcom.js index 129190a747851c..a98d093177b65f 100644 --- a/web/firefoxcom.js +++ b/web/firefoxcom.js @@ -218,6 +218,7 @@ class MozL10n { "findcasesensitivitychange", "findentirewordchange", "findbarclose", + 
"finddiacriticmatchingchange", ]; const handleEvent = function ({ type, detail }) { if (!PDFViewerApplication.initialized) { @@ -236,6 +237,7 @@ class MozL10n { entireWord: !!detail.entireWord, highlightAll: !!detail.highlightAll, findPrevious: !!detail.findPrevious, + matchDiacritics: !!detail.matchDiacritics, }); }; diff --git a/web/pdf_find_bar.js b/web/pdf_find_bar.js index 3388711d935fd7..cd00f8d438b245 100644 --- a/web/pdf_find_bar.js +++ b/web/pdf_find_bar.js @@ -33,6 +33,7 @@ class PDFFindBar { this.highlightAll = options.highlightAllCheckbox; this.caseSensitive = options.caseSensitiveCheckbox; this.entireWord = options.entireWordCheckbox; + this.matchDiacritics = options.matchDiacriticsCheckbox; this.findMsg = options.findMsg; this.findResultsCount = options.findResultsCount; this.findPreviousButton = options.findPreviousButton; @@ -82,6 +83,10 @@ class PDFFindBar { this.dispatchEvent("entirewordchange"); }); + this.matchDiacritics.addEventListener("click", () => { + this.dispatchEvent("diacriticmatchingchange"); + }); + this.eventBus._on("resize", this._adjustWidth.bind(this)); } @@ -99,6 +104,7 @@ class PDFFindBar { entireWord: this.entireWord.checked, highlightAll: this.highlightAll.checked, findPrevious: findPrev, + matchDiacritics: this.matchDiacritics.checked, }); } diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 7eeb01e65e8658..e1c8fdb317643a 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -13,9 +13,9 @@ * limitations under the License. 
*/ +import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js"; import { createPromiseCapability } from "pdfjs-lib"; import { getCharacterType } from "./pdf_find_utils.js"; -import { scrollIntoView } from "./ui_utils.js"; const FindState = { FOUND: 0, @@ -42,47 +42,141 @@ const CHARACTERS_TO_NORMALIZE = { "\u00BE": "3/4", // Vulgar fraction three quarters }; +const diacriticsRegExp = /\p{Mn}+/gu; +const escapeRegExp = /[.*+\-?^${}()|[\]\\]/g; +const prepareNoDiacriticsRegExp = /(\p{Mn})|(\p{L})/gu; +const whitespacesRegExp = /\s+/g; +const notDiacriticFromEndRegExp = /([^\p{Mn}])\p{Mn}*$/u; +const notDiacriticFromStartRegExp = /^\p{Mn}*([^\p{Mn}])/u; + let normalizationRegex = null; function normalize(text) { + // The diacritics in the text or in the query can be composed or not. + // So we use a decomposed text using NFD (and the same for the query) + // in order to be sure that diacritics are in the same order. + if (!normalizationRegex) { // Compile the regular expression for text normalization once. const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); - normalizationRegex = new RegExp(`[${replace}]`, "g"); + normalizationRegex = new RegExp( + `([${replace}])|(-\\n)|(\\n)|(\\p{Mn}+)`, + "gum" + ); } - let diffs = null; - const normalizedText = text.replace(normalizationRegex, function (ch, index) { - const normalizedCh = CHARACTERS_TO_NORMALIZE[ch], - diff = normalizedCh.length - ch.length; - if (diff !== 0) { - (diffs ||= []).push([index, diff]); + + // The goal of this function is to normalize the string and + // be able to get from an index in the new string the + // corresponding index in the old string. + // For example if we have: abCd12ef456gh where C is replaced by ccc + // and numbers replaced by nothing (it's the case for diacritics), then + // we'll obtain the normalized string: abcccdefgh. + // So here the reverse map is: [0,1,2,2,2,3,6,7,11,12]. 
+ + // The goal is to obtain the array: [[0, 0], [3, -1], [4, -2], + // [6, 0], [8, 3]]. + // which can be used like this: + // - let say that i is the index in new string and j the index + // the old string. + // - if i is in [0; 3[ then j = i + 0 + // - if i is in [3; 4[ then j = i - 1 + // - if i is in [4; 6[ then j = i - 2 + // ... + // Thanks to a binary search it's easy to know where is i and what's the + // shift. + // Let say that the last entry in the array is [x, s] and we have a + // substitution at index y (old string) which will replace o chars by n chars. + // Firstly, if o === n, then no need to add a new entry: the shift is + // the same. + // Secondly, if o < n, then we push the n - o elements: + // [y - (s - 1), s - 1], [y - (s - 2), s - 2], ... + // Thirdly, if o > n, then we push the element: [y - (s - n), o + s - n] + + // Collect diacritics length and positions. + const rawDiacriticsPositions = []; + let m; + while ((m = diacriticsRegExp.exec(text)) !== null) { + rawDiacriticsPositions.push([m[0].length, m.index]); + } + + let normalized = text.normalize("NFD"); + const positions = [[0, 0]]; + let k = 0; + let shift = 0; + let shiftOrigin = 0; + let eol = 0; + normalized = normalized.replace( + normalizationRegex, + (match, p1, p2, p3, p4, i) => { + i -= shiftOrigin; + if (p1) { + // Fractions... + const replacement = CHARACTERS_TO_NORMALIZE[match]; + const jj = replacement.length; + for (let j = 1; j < jj; j++) { + positions.push([i - shift + j, shift - j]); + } + shift -= jj - 1; + return replacement; + } + + if (p2) { + // "-\n" is removed because an hypen at the end of a line + // is likely here to mark a break in a word. + positions.push([i - shift, 1 + shift]); + shift += 1; + shiftOrigin += 1; + eol += 1; + return ""; + } + + if (p3) { + // eol is replaced by space: "foo\nbar" is likely equivalent to + // "foo bar". 
+ positions.push([i - shift + 1, shift - 1]); + shift -= 1; + shiftOrigin += 1; + eol += 1; + return " "; + } + + // Diacritics. + let jj = match.length; + if (i + eol === rawDiacriticsPositions?.[k]?.[1]) { + jj -= rawDiacriticsPositions[k][0]; + ++k; + } + + for (let j = 1; j < jj + 1; j++) { + // i is the position of the first diacritic + // so (i - 1) is the position for the letter before. + positions.push([i - 1 - shift + j, shift - j]); + } + shift -= jj; + shiftOrigin += jj; + + return match; } - return normalizedCh; - }); + ); - return [normalizedText, diffs]; + positions.push([normalized.length, shift]); + + return [normalized, positions]; } -// Determine the original, non-normalized, match index such that highlighting of -// search results is correct in the `textLayer` for strings containing e.g. "½" -// characters; essentially "inverting" the result of the `normalize` function. -function getOriginalIndex(matchIndex, diffs = null) { - if (!diffs) { - return matchIndex; +function getOriginalIndex(positions, pos, len) { + const start = pos; + const end = pos + len - 1; + let i = binarySearchFirstItem(positions, x => x[0] >= start); + if (positions[i][0] > start) { + --i; } - let totalDiff = 0; - for (const [index, diff] of diffs) { - const currentIndex = index + totalDiff; - if (currentIndex >= matchIndex) { - break; - } - if (currentIndex + diff > matchIndex) { - totalDiff += matchIndex - currentIndex; - break; - } - totalDiff += diff; + let j = binarySearchFirstItem(positions, x => x[0] >= end, i); + if (positions[j][0] > end) { + --j; } - return matchIndex - totalDiff; + + return [start + positions[i][1], len + positions[j][1] - positions[i][1]]; } /** @@ -302,192 +396,111 @@ class PDFFindController { return true; } - /** - * Helper for multi-term search that fills the `matchesWithLength` array - * and handles cases where one search term includes another search term (for - * example, "tamed tame" or "this is"). 
It looks for intersecting terms in - * the `matches` and keeps elements with a longer match length. - */ - _prepareMatches(matchesWithLength, matches, matchesLength) { - function isSubTerm(currentIndex) { - const currentElem = matchesWithLength[currentIndex]; - const nextElem = matchesWithLength[currentIndex + 1]; - - // Check for cases like "TAMEd TAME". - if ( - currentIndex < matchesWithLength.length - 1 && - currentElem.match === nextElem.match - ) { - currentElem.skipped = true; - return true; - } - - // Check for cases like "thIS IS". - for (let i = currentIndex - 1; i >= 0; i--) { - const prevElem = matchesWithLength[i]; - if (prevElem.skipped) { - continue; - } - if (prevElem.match + prevElem.matchLength < currentElem.match) { - break; - } - if ( - prevElem.match + prevElem.matchLength >= - currentElem.match + currentElem.matchLength - ) { - currentElem.skipped = true; - return true; - } - } - return false; - } - - // Sort the array of `{ match: , matchLength: }` - // objects on increasing index first and on the length otherwise. - matchesWithLength.sort(function (a, b) { - return a.match === b.match - ? a.matchLength - b.matchLength - : a.match - b.match; - }); - for (let i = 0, len = matchesWithLength.length; i < len; i++) { - if (isSubTerm(i)) { - continue; - } - matches.push(matchesWithLength[i].match); - matchesLength.push(matchesWithLength[i].matchLength); - } - } - /** * Determine if the search query constitutes a "whole word", by comparing the * first/last character type with the preceding/following character type. 
*/ _isEntireWord(content, startIdx, length) { - if (startIdx > 0) { + let match = content.slice(0, startIdx).match(notDiacriticFromEndRegExp); + if (match) { const first = content.charCodeAt(startIdx); - const limit = content.charCodeAt(startIdx - 1); - if (getCharacterType(first) === getCharacterType(limit)) { + if (getCharacterType(first) === getCharacterType(match[1])) { return false; } } - const endIdx = startIdx + length - 1; - if (endIdx < content.length - 1) { - const last = content.charCodeAt(endIdx); - const limit = content.charCodeAt(endIdx + 1); - if (getCharacterType(last) === getCharacterType(limit)) { + + match = content.slice(startIdx + length).match(notDiacriticFromStartRegExp); + if (match) { + const last = content.charCodeAt(startIdx + length - 1); + if (getCharacterType(last) === getCharacterType(match[1])) { return false; } } + return true; } - _calculatePhraseMatch(query, pageIndex, pageContent, pageDiffs, entireWord) { + _calculateRegExpMatch(query, entireWord, pageIndex, pageContent) { const matches = [], matchesLength = []; - const queryLen = query.length; - let matchIdx = -queryLen; - while (true) { - matchIdx = pageContent.indexOf(query, matchIdx + queryLen); - if (matchIdx === -1) { - break; - } - if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) { + const diffs = this._pageDiffs[pageIndex]; + let match; + while ((match = query.exec(pageContent)) !== null) { + if ( + entireWord && + !this._isEntireWord(pageContent, match.index, match[0].length) + ) { continue; } - const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs), - matchEnd = matchIdx + queryLen - 1, - originalQueryLen = - getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1; - matches.push(originalMatchIdx); - matchesLength.push(originalQueryLen); + const [matchPos, matchLen] = getOriginalIndex( + diffs, + match.index, + match[0].length + ); + matches.push(matchPos); + matchesLength.push(matchLen); } this._pageMatches[pageIndex] = matches; 
this._pageMatchesLength[pageIndex] = matchesLength; } - _calculateWordMatch(query, pageIndex, pageContent, pageDiffs, entireWord) { - const matchesWithLength = []; + _convertToRegExpString(query) { + const { matchDiacritics } = this._state; - // Divide the query into pieces and search for text in each piece. - const queryArray = query.match(/\S+/g); - for (let i = 0, len = queryArray.length; i < len; i++) { - const subquery = queryArray[i]; - const subqueryLen = subquery.length; + // Escape characters like *+?... to not interfer with regexp syntax. + query = query.replace(escapeRegExp, "\\$&"); - let matchIdx = -subqueryLen; - while (true) { - matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen); - if (matchIdx === -1) { - break; - } - if ( - entireWord && - !this._isEntireWord(pageContent, matchIdx, subqueryLen) - ) { - continue; + if (matchDiacritics) { + // aX musn't match aXY. + query = `${query}(?=[^\\p{Mn}])`; + } else { + query = query.replace(prepareNoDiacriticsRegExp, (match, p1) => { + if (p1) { + // Diacritics are removed. + return ""; } - const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs), - matchEnd = matchIdx + subqueryLen - 1, - originalQueryLen = - getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1; - - // Other searches do not, so we store the length. - matchesWithLength.push({ - match: originalMatchIdx, - matchLength: originalQueryLen, - skipped: false, - }); - } + // A letter has been matched and it can be followed by any diacritics + // in normalized text. + return `${match}\\p{Mn}*`; + }); } - // Prepare arrays for storing the matches. - this._pageMatchesLength[pageIndex] = []; - this._pageMatches[pageIndex] = []; + // Replace spaces by \s+ to be sure to match any spaces. + // We must do it after the if (matchDiacritcs) block to avoid + // wrong things with the "s". 
+ query = query.replace(whitespacesRegExp, "\\s+"); - // Sort `matchesWithLength`, remove intersecting terms and put the result - // into the two arrays. - this._prepareMatches( - matchesWithLength, - this._pageMatches[pageIndex], - this._pageMatchesLength[pageIndex] - ); + return query; } _calculateMatch(pageIndex) { - let pageContent = this._pageContents[pageIndex]; - const pageDiffs = this._pageDiffs[pageIndex]; let query = this._query; - const { caseSensitive, entireWord, phraseSearch } = this._state; - if (query.length === 0) { // Do nothing: the matches should be wiped out already. return; } - if (!caseSensitive) { - pageContent = pageContent.toLowerCase(); - query = query.toLowerCase(); - } + const { caseSensitive, entireWord, phraseSearch } = this._state; + const pageContent = this._pageContents[pageIndex]; + const flags = caseSensitive ? "gu" : "gui"; if (phraseSearch) { - this._calculatePhraseMatch( - query, - pageIndex, - pageContent, - pageDiffs, - entireWord - ); + query = this._convertToRegExpString(query); } else { - this._calculateWordMatch( - query, - pageIndex, - pageContent, - pageDiffs, - entireWord - ); + // Words are sorted in reverse order to be sure that "foobar" is matched + // before "foo" in case the query is "foobar foo". + query = query + .match(/\S+/g) + .sort() + .reverse() + .map(q => `(${this._convertToRegExpString(q)})`) + .join("|"); } + query = new RegExp(query, flags); + + this._calculateRegExpMatch(query, entireWord, pageIndex, pageContent); // When `highlightAll` is set, ensure that the matches on previously // rendered (and still active) pages are correctly highlighted. @@ -533,6 +546,9 @@ class PDFFindController { for (let j = 0, jj = textItems.length; j < jj; j++) { strBuf.push(textItems[j].str); + if (textItems[j].hasEOL) { + strBuf.push("\n"); + } } // Store the normalized page content (text items) as one string. 
diff --git a/web/ui_utils.js b/web/ui_utils.js index d9af9235b76461..dd26ddbba572c4 100644 --- a/web/ui_utils.js +++ b/web/ui_utils.js @@ -204,8 +204,8 @@ function parseQueryString(query) { * @returns {number} Index of the first array element to pass the test, * or |items.length| if no such element exists. */ -function binarySearchFirstItem(items, condition) { - let minIndex = 0; +function binarySearchFirstItem(items, condition, start = 0) { + let minIndex = start; let maxIndex = items.length - 1; if (maxIndex < 0 || !condition(items[maxIndex])) { diff --git a/web/viewer.html b/web/viewer.html index c390ba62e88693..57f4e1021caf97 100644 --- a/web/viewer.html +++ b/web/viewer.html @@ -138,8 +138,13 @@
- + + + +
+ +
diff --git a/web/viewer.js b/web/viewer.js index 8c843566010575..fdf6ecbfb249b7 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -155,6 +155,7 @@ function getViewerConfiguration() { highlightAllCheckbox: document.getElementById("findHighlightAll"), caseSensitiveCheckbox: document.getElementById("findMatchCase"), entireWordCheckbox: document.getElementById("findEntireWord"), + matchDiacriticsCheckbox: document.getElementById("findMatchDiacritics"), findMsg: document.getElementById("findMsg"), findResultsCount: document.getElementById("findResultsCount"), findPreviousButton: document.getElementById("findPrevious"),