From a019b231ba6d91829b9eca2691d1f8a7735a658a Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Sat, 19 Mar 2022 15:32:03 -0700 Subject: [PATCH] Account for rich text strings in BIFF5 files Fixes #611 See https://github.com/libxls/libxls/commit/b6d9d872756f69780b743dbaec9cd2ec30c37740 --- debug/debug.R | 19 +++++++++++++++--- src/XlsCell.h | 1 + src/XlsCellSet.h | 1 + .../sheets/biff5-rich-text-string.xls | Bin 0 -> 19968 bytes tests/testthat/test-compatibility.R | 17 ++++++++++++++++ 5 files changed, 35 insertions(+), 3 deletions(-) create mode 100755 tests/testthat/sheets/biff5-rich-text-string.xls diff --git a/debug/debug.R b/debug/debug.R index 565ad9db..4c58160e 100644 --- a/debug/debug.R +++ b/debug/debug.R @@ -1,7 +1,20 @@ devtools::clean_dll() devtools::load_all() -files <- fs::dir_ls("investigations/Data", recurse = TRUE, glob = "*.XLS") -#files -read_xls(files[[1]]) +# success <- read_excel( +# "investigations/sample_data/success.xls", +# col_names = FALSE, +# range = "A1" +# ) +failure <- read_excel("investigations/sample_data/failure.xls", col_names=F) +# cell_is_readable +# cell->id is 214 for the failure +# 214 in hexadecimal is 0xD6 +# in xlsstruct.h: +# #define XLS_RECORD_RSTRING 0x00D6 + +# cell->id is 516 for the success +# 516 in hexadecimal is 0x204 +# in xlsstruct.h: +# #define XLS_RECORD_LABEL 0x0204 diff --git a/src/XlsCell.h b/src/XlsCell.h index f41f8a40..0321f017 100644 --- a/src/XlsCell.h +++ b/src/XlsCell.h @@ -130,6 +130,7 @@ class XlsCell { switch(cell_->id) { case XLS_RECORD_LABELSST: case XLS_RECORD_LABEL: + case XLS_RECORD_RSTRING: { std::string s = cell_->str == NULL ? "" : cell_->str; ct = na.contains(s, trimWs) ? CELL_BLANK : CELL_TEXT; diff --git a/src/XlsCellSet.h b/src/XlsCellSet.h index 603c2fed..12cda78b 100644 --- a/src/XlsCellSet.h +++ b/src/XlsCellSet.h @@ -152,6 +152,7 @@ class XlsCellSet { bool cell_is_readable(const xls::xlsCell* cell) { return cell && ( cell->id == XLS_RECORD_MULRK || + cell->id == XLS_RECORD_RSTRING || cell->id == XLS_RECORD_NUMBER || cell->id == XLS_RECORD_RK || cell->id == XLS_RECORD_LABELSST || diff --git a/tests/testthat/sheets/biff5-rich-text-string.xls b/tests/testthat/sheets/biff5-rich-text-string.xls new file mode 100755 index 0000000000000000000000000000000000000000..ed8963fc7b9afc66a65f808c8a6ace9dae106468 GIT binary patch literal 19968 zcmeHOU2GiH6+ScGAKUTI+W8|JaIORSF)?leO-l&`yN(klP3$yl5kP%lu*dP@^^Tcc zC%9^9B+65fNwXWsw%ZFec%xRRgn;pDp4twc;NvakXnf<#QnZ|XLe`S z8^=&6rOZUfGxyvx_ndpq{W!xX_|a(#w8@BwVSC5T}P_cVwx&w^S(ZJ>4#W88szC#Vb54Z;wR9#Ai6BWM$d za$9iU3c3lj4U`9M2i*)}J!uD?6{sJy6Lc%+Hqb5*>yzuh%X>O#Dtb|Jy`2{zJg?_Gq?9+PnZ-8utlLl8>3bG^%_~&TfA4e2swN2zyrHJRU5pDW+Z_t0v-Sz6< z%$5TG({{#<_OCVnY4dvKKighw{&TJA1FbdxccXp}XfJ30bUSDtXb^MtBad&^vK7e)oB|NWu=UEOGTW56*b{rP-naar+fe- z=Ef42ZrH|480l+i*(6WI+6epECzo#c<})z~^JZTvCBG1Be_tvszZ83D{}s$SPPBil z$ki7W9&jbRwS9mw7xNVpltvzt6-#;`H+=E<#bHIC8eYB$@0KRKZJO71aahNLNslFM znm2M$bbD!rQg(rdq$rRxw)AUWU5@V>UT8~HuKsSly>T%%N9CIFTE*BGl^d3*eQ9k= z{9P-?h|bA z_ILPVC|B=CombaU=SAg0y-^d{>FC3qoP?JYhVe-`2nZnqV z8XGB0sYB}M@v)Of?ihXj%)~2GqYtSGZ%7>+R1cpWIR>%u*I%9(YT2DRn7%iYvNvdd z^uTUNrRmn!jfNh7F<;s7d2k@{|;c)g{eU-mF5?i>_Z?re5EzEV|QGvMtR@`z`IWseOtD<0iT=-E}mAz_1tD&-?%aXNh#BP(kD|?**?E`dwMvVmR!Gy7e z#gey-9>Oy3f=jCkPRa3?+4r-~<8C#sa!Bnh9o{pb&d#~zc#SH;z*zStr>B<|jSnhS zr|P1=uJ)#Tz+8DQ+F<|_nx?fe)IrT&Tlq#s_tj-Nj zVjOeIZjtwJWK7xJC|Nj7X64lMgi|Ulr|eF!Y#c7rGVYdrFgAkG0>%azbrxN9+A9^O z-IU!0W(w?#7(W`pv=%((dtUW?$(gT0a61GuIH_)wQLp5ck=v+a-g(zAFCgS8yA9P@ zm|)2mtz(v)mw#&OIA1Mv2vfl=aWtl2*>;S^6V9~v=u$j@WI{t#| z(-uzk?J#=WTdKHoUTKEx&0wa%?u5|`ixsb2nJ;9dbhZcS;9<)}>(G zEunQ>4f^C?fpuxHchs#*!@3=??xeSbaG&tzoO0Q%Sat>~TSSR@O5ut2aqZaw`_8(h zs(apdXWX)3XE2{PNP*$5TaktpJ3}kD0p#Uc5W$T0QC3c4_IT6t3~EOEDT`&n@k_>p zOwEK0x@1hCVRVrWmluSq6K~TKX_@3`<${#aE@+W4b%-J2-k%p63ng`B8`WRSM;1JV!rBii4%mu;F4Sm2AATz zWCA-{Yj*ha$ZAveR;<;uqdRniDpcJIj$ft1*1(W9JkbzKPNnP~6zGvQIa;iKbKZ9x zjJl0r(xg=(X^q;#eY#KDwJlv}fCD2Vr(F*8l-&Ui$3_aHYpzWF9Q6!B8=g{tX0-lZ z6w<(G+U4ZZJBx2)Y!(+eA&C?A*8{?LBBOyjQ(n5X>JGfi8IzJTt~0}l!R?h3sD+4IZJ+dN+qOU_-O?q6={e6u zj0V+XOXa~7MiMu@-B`cPKIYBNDz~IY$}@h^)zf*?GL zjtx#WT>Yt-``4gW*A2z{pe|fD=#Orgun)m+@CVn&JHT#OV!4zJLFriYgZg2(`wv`s z^~FiIj;pN|Ib$X*?`?iH%O!&EJIGZmASDK(YD5F_jc9;10_3VFj+B7Ld~&rzjTnG6 z1+5F>N(BL(!2JU5#50MUm0`F+$5sIbktGHJ@C+!#CJaiALf~wT2YtPWQ*Oc+`S|@{(Z3CTW}4QF&&-8~DY@Fy zGpvSHXpiCJLrPOVK-H#ve0;fT^$Y`D2-`8Ib^N`W>yb$N0lGnX3-YkS_`%rOPk9US zP53<)>}oxxD2c8nQ-|&f9#Q6a-avi+`tN3+0`JVTG#Dg4?*dYKWcyjJEw+Fu1a5;a6M^U+K|P|NzkZF$t2eE z5tNi1k>m0Jr~sP8YOnD3DE=03KLzPCcpAeS{Ejt(XFeSQ9fdsGJAw^$R9=VNguH^< zQF#cp6WA?=a6gDWTFJveO-AHc(AqepUxvgGvWg5O()fPBGYU};41z|m12Z%@p;E*x zp&Xod^pj5c5&aYkhrB%Lw45i2MW-Q~w(Q%6L1SFJ=qb*NS%rpQmgJa)ow z0ffl+41`FYDG@#7S7*US_=!_I`v7I$OkqpD99{wP}l%js) z=ro8N>kJ~tKAIBy!;dZ0WAHb_r=DADQtZ?1wGwq5sFk9+unZ$cMBLblLEP9+QzF{7 z871m6SexM!YYTN5ux(%M9mXm2pF!BdIB`ud5F1-D5L-AC4Zy}9WtcjSzi_-8Oik75 z*tc-J8yGDd?*=ss^;zotYWW^1iPU@RQ(1{4d&yTQT+Ym)3=INlBN z73!-)U({frFZS7#7&rddjJ{kC%4Ob)#;Oi;vChC;?4v2Meg2S*?KgwAj1+wr&MAXv zh2z2iSz%we(itR+?-?YE{WB%{jz5^Au13_#NO11*XE3G|9TNtu3iWV>GiVhh4O+#1 zni7#JOlz&jfK-N0km|OY6yd3`PX?6=`((f<>S_)Y9kzQTXcWZs#h4}6|Bwf+;TLlm zo7;2zvQWJ8^iSV=e{wGOn;*2wzCFMHE2rjLAO?~34}$pSBOq?BB@j0u{)&c+_jf^D zT%QGTF?s>S?c!G;4*1tVT-M(Nv7NW@GPioR&FepX&Y_J{NOz}^Zh1(wJp8iG{L=9W z@LX;0Fl1uCOGG)j5)ULENIZ~uAn`!rfy4ue2NDk?9!NZpcp&k>|CxqZtPgw>b?WsQC>LLB}}|!P6>;7|gdo1RjY85%>8q=)>SW zY>QWT??a>1PcYs5gaGY=$2K{(D0kNL=PUeqK}ADTA*6(w* literal 0 HcmV?d00001 diff --git a/tests/testthat/test-compatibility.R b/tests/testthat/test-compatibility.R index 728f5102..94e9d192 100644 --- a/tests/testthat/test-compatibility.R +++ b/tests/testthat/test-compatibility.R @@ -63,3 +63,20 @@ test_that("we can read LAPD arrest sheets", { expect_match(lapd$ARR_LOC[9], "HOLLYWOOD") expect_identical(lapd$CHG_DESC[27], "EX CON W/ A GUN") }) + +# https://github.com/tidyverse/readxl/issues/611 +# xls file produced by ABBYY FineReader (OCR of PDFs) +# inspired libxls to add support for rich-text strings in BIFF5 +# https://github.com/libxls/libxls/commit/b6d9d872756f69780b743dbaec9cd2ec30c37740 +test_that("we can read xls from ABBYY FineReader", { + expect_error_free( + abbyy <- read_excel( + test_sheet("biff5-rich-text-string.xls"), + col_names = FALSE, + n_max = 1 + ) + ) + expect_equal(nrow(abbyy), 1) + expect_equal(ncol(abbyy), 1) + expect_match(abbyy[[1,1]], "^ELECTORAL") +})