Skip to content

Commit

Permalink
Account for rich text strings in BIFF5 files
Browse files Browse the repository at this point in the history
  • Loading branch information
jennybc committed Mar 19, 2022
1 parent d9ea739 commit a019b23
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 3 deletions.
19 changes: 16 additions & 3 deletions debug/debug.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
devtools::clean_dll()
devtools::load_all()

files <- fs::dir_ls("investigations/Data", recurse = TRUE, glob = "*.XLS")
#files
read_xls(files[[1]])
# success <- read_excel(
# "investigations/sample_data/success.xls",
# col_names = FALSE,
# range = "A1"
# )
# Read the failing sample workbook without treating the first row as headers.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
failure <- read_excel("investigations/sample_data/failure.xls", col_names = FALSE)

# Observations in cell_is_readable():
# cell->id is 214 for failure.xls
# 214 in hexadecimal is 0xD6
# in xlsstruct.h:
# #define XLS_RECORD_RSTRING 0x00D6

# cell->id is 516 for success.xls
# 516 in hexadecimal is 0x204
# in xlsstruct.h:
# #define XLS_RECORD_LABEL 0x0204
1 change: 1 addition & 0 deletions src/XlsCell.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ class XlsCell {
switch(cell_->id) {
case XLS_RECORD_LABELSST:
case XLS_RECORD_LABEL:
case XLS_RECORD_RSTRING:
{
std::string s = cell_->str == NULL ? "" : cell_->str;
ct = na.contains(s, trimWs) ? CELL_BLANK : CELL_TEXT;
Expand Down
1 change: 1 addition & 0 deletions src/XlsCellSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ class XlsCellSet {
bool cell_is_readable(const xls::xlsCell* cell) {
return cell && (
cell->id == XLS_RECORD_MULRK ||
cell->id == XLS_RECORD_RSTRING ||
cell->id == XLS_RECORD_NUMBER ||
cell->id == XLS_RECORD_RK ||
cell->id == XLS_RECORD_LABELSST ||
Expand Down
Binary file added tests/testthat/sheets/biff5-rich-text-string.xls
Binary file not shown.
17 changes: 17 additions & 0 deletions tests/testthat/test-compatibility.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,20 @@ test_that("we can read LAPD arrest sheets", {
expect_match(lapd$ARR_LOC[9], "HOLLYWOOD")
expect_identical(lapd$CHG_DESC[27], "EX CON W/ A GUN")
})

# https://github.com/tidyverse/readxl/issues/611
# xls file produced by ABBYY FineReader (OCR of PDFs)
# inspired libxls to add support for rich-text strings in BIFF5
# https://github.com/libxls/libxls/commit/b6d9d872756f69780b743dbaec9cd2ec30c37740
test_that("we can read xls from ABBYY FineReader", {
  # Read just the first row, with no header row; the read itself must
  # complete without raising an error.
  expect_error_free({
    abbyy <- read_excel(
      test_sheet("biff5-rich-text-string.xls"),
      col_names = FALSE, n_max = 1
    )
  })

  # The result is a single cell ...
  expect_equal(nrow(abbyy), 1)
  expect_equal(ncol(abbyy), 1)

  # ... whose text starts with "ELECTORAL".
  expect_match(abbyy[[1, 1]], "^ELECTORAL")
})

0 comments on commit a019b23

Please sign in to comment.