Skip to content

Commit

Permalink
Merge pull request #118 from shrektan/shrektan/encoding
Browse files Browse the repository at this point in the history
UTF-8 in/out
  • Loading branch information
ycphs authored Nov 23, 2020
2 parents 44014fa + 2b6b5ba commit 4dfbf55
Show file tree
Hide file tree
Showing 15 changed files with 132 additions and 40 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# openxlsx 4.2.3

## New Features

* Most of functions in openxlsx now support non-ASCII arguments better. More specifically, we can use non-ASCII strings as names or contents for `createNamedRegion()` ([#103](https://github.com/ycphs/openxlsx/issues/103)), `writeComment()`, `writeData()`, `writeDataTable()` and `writeFormula()`. In addition, openxlsx now reads comments and region names that contain non-ASCII strings correctly on Windows. Thanks to @shrektan for the PR [#118](https://github.com/ycphs/openxlsx/pull/118).

## Fixes for Check issues

* Fix to pass the tests for link-time optimization type mismatches
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ get_letters <- function() {
.Call(`_openxlsx_get_letters`)
}

markUTF8 <- function(x, clone) {
.Call(`_openxlsx_markUTF8`, x, clone)
}

loadworksheets <- function(wb, styleObjects, xmlFiles, is_chart_sheet) {
.Call(`_openxlsx_loadworksheets`, wb, styleObjects, xmlFiles, is_chart_sheet)
}
Expand Down
8 changes: 5 additions & 3 deletions R/WorkbookClass.R
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ Workbook$methods(
# The result is saved to a new chart xml file
newfl <- file.path(dirname(fl), newname)
charts[newname] <<- newfl
chart <- readLines(fl, warn = FALSE, encoding = "UTF-8")
chart <- readUTF8(fl)
chart <-
gsub(
stri_join("(?<=')", sheet_names[[clonedSheet]], "(?='!)"),
Expand Down Expand Up @@ -1150,6 +1150,9 @@ Workbook$methods(
ref,
as.integer(totalsRowCount)
)
# because tableName might be native encoded non-ASCII strings, we need to ensure
# it's UTF-8 encoded
table <- enc2utf8(table)

nms <- names(tables)
tSheets <- attr(tables, "sheet")
Expand Down Expand Up @@ -17825,8 +17828,7 @@ Workbook$methods(
Workbook$methods(
loadStyles = function(stylesXML) {
## Build style objects from the styles XML
stylesTxt <-
readLines(stylesXML, warn = FALSE, encoding = "UTF-8")
stylesTxt <- readUTF8(stylesXML)
stylesTxt <- removeHeadTag(stylesTxt)

## Indexed colours
Expand Down
4 changes: 4 additions & 0 deletions R/helperFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -917,3 +917,7 @@ hashPassword <- function(password) {
hash <- bitwXor(bitwXor(hash, length(chars)), 0xCE4B)
format(as.hexmode(hash), upper.case = TRUE)
}

readUTF8 <- function(x) {
readLines(x, warn = FALSE, encoding = "UTF-8")
}
26 changes: 13 additions & 13 deletions R/loadWorkbook.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {

## core
if (length(coreXML) == 1) {
coreXML <- paste(readLines(con = coreXML, encoding = "UTF-8", warn = FALSE), collapse = "")
coreXML <- paste(readUTF8(coreXML), collapse = "")
wb$core <- removeHeadTag(x = coreXML)
}

Expand All @@ -115,7 +115,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
worksheet_rId_mapping <- NULL
workbookRelsXML <- xmlFiles[grepl("workbook.xml.rels$", xmlFiles, perl = TRUE)]
if (length(workbookRelsXML) > 0) {
workbookRelsXML <- paste(readLines(con = workbookRelsXML, encoding = "UTF-8", warn = FALSE), collapse = "")
workbookRelsXML <- paste(readUTF8(workbookRelsXML), collapse = "")
workbookRelsXML <- getChildlessNode(xml = workbookRelsXML, tag = "<Relationship ")
worksheet_rId_mapping <- workbookRelsXML[grepl("worksheets/sheet", workbookRelsXML, fixed = TRUE)]
}
Expand All @@ -142,7 +142,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
## xl\
## xl\workbook
if (length(workbookXML) > 0) {
workbook <- readLines(workbookXML, warn = FALSE, encoding = "UTF-8")
workbook <- readUTF8(workbookXML)
workbook <- removeHeadTag(workbook)

sheets <- unlist(regmatches(workbook, gregexpr("(?<=<sheets>).*(?=</sheets>)", workbook, perl = TRUE)))
Expand Down Expand Up @@ -176,7 +176,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
for (i in 1:length(sheetrId)) {
if (is_chart_sheet[i]) {
count <- 0
txt <- paste(readLines(chartSheetsXML[j], warn = FALSE, encoding = "UTF-8"), collapse = "")
txt <- paste(readUTF8(chartSheetsXML[j]), collapse = "")

zoom <- regmatches(txt, regexpr('(?<=zoomScale=")[0-9]+', txt, perl = TRUE))
if (length(zoom) == 0) {
Expand Down Expand Up @@ -235,7 +235,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {

## xl\sharedStrings
if (length(sharedStringsXML) > 0) {
sharedStrings <- readLines(sharedStringsXML, warn = FALSE, encoding = "UTF-8")
sharedStrings <- readUTF8(sharedStringsXML)
sharedStrings <- paste(sharedStrings, collapse = "\n")
sharedStrings <- removeHeadTag(sharedStrings)

Expand Down Expand Up @@ -419,7 +419,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {

## xl\theme
if (length(themeXML) > 0) {
wb$theme <- removeHeadTag(paste(unlist(lapply(sort(themeXML)[[1]], function(x) readLines(x, warn = FALSE, encoding = "UTF-8"))), collapse = ""))
wb$theme <- removeHeadTag(paste(unlist(lapply(sort(themeXML)[[1]], readUTF8)), collapse = ""))
}


Expand Down Expand Up @@ -521,7 +521,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {

xml <- lapply(1:length(allRels), function(i) {
if (haveRels[i]) {
xml <- readLines(allRels[[i]], warn = FALSE, encoding = "UTF-8")
xml <- readUTF8(allRels[[i]])
xml <- removeHeadTag(xml)
xml <- gsub("<Relationships .*?>", "", xml)
xml <- gsub("</Relationships>", "", xml)
Expand Down Expand Up @@ -622,7 +622,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
tablesXML <- tablesXML[sprintf("table%s.xml", unlist(tables))]

## tables are now in correct order so we can read them in as they are
wb$tables <- sapply(tablesXML, function(x) removeHeadTag(paste(readLines(x, warn = FALSE), collapse = "")))
wb$tables <- sapply(tablesXML, function(x) removeHeadTag(paste(readUTF8(x), collapse = "")))

## pull out refs and attach names
refs <- regmatches(wb$tables, regexpr('(?<=ref=")[0-9A-Z:]+', wb$tables, perl = TRUE))
Expand Down Expand Up @@ -690,14 +690,14 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
hasDrawing <- sapply(drawXMLrelationship, length) > 0 ## which sheets have a drawing

if (length(drawingRelsXML) > 0) {
dRels <- lapply(drawingRelsXML, readLines, warn = FALSE)
dRels <- lapply(drawingRelsXML, readUTF8)
dRels <- unlist(lapply(dRels, removeHeadTag))
dRels <- gsub("<Relationships .*?>", "", dRels)
dRels <- gsub("</Relationships>", "", dRels)
}

if (length(drawingsXML) > 0) {
dXML <- lapply(drawingsXML, readLines, warn = FALSE, encoding = "UTF-8")
dXML <- lapply(drawingsXML, readUTF8)
dXML <- unlist(lapply(dXML, removeHeadTag))
dXML <- gsub("<xdr:wsDr .*?>", "", dXML)
dXML <- gsub("</xdr:wsDr>", "", dXML)
Expand Down Expand Up @@ -753,7 +753,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
ind <- grepl(target, vmlDrawingXML)

if (any(ind)) {
txt <- paste(readLines(vmlDrawingXML[ind], warn = FALSE), collapse = "\n")
txt <- paste(readUTF8(vmlDrawingXML[ind]), collapse = "\n")
txt <- removeHeadTag(txt)

i1 <- regexpr("<v:shapetype", txt, fixed = TRUE)
Expand Down Expand Up @@ -792,7 +792,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
ind <- grepl(target, vmlDrawingXML)

if (any(ind)) {
txt <- paste(readLines(vmlDrawingXML[ind], warn = FALSE), collapse = "\n")
txt <- paste(readUTF8(vmlDrawingXML[ind]), collapse = "\n")
txt <- removeHeadTag(txt)

cd <- unique(getNodes(xml = txt, tagIn = "<x:ClientData"))
Expand All @@ -803,7 +803,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
target <- unlist(lapply(commentXMLrelationship[[i]], function(x) regmatches(x, gregexpr('(?<=Target=").*?"', x, perl = TRUE))[[1]]))
target <- basename(gsub('"$', "", target))

txt <- paste(readLines(commentsXML[grepl(target, commentsXML)], warn = FALSE), collapse = "\n")
txt <- paste(readUTF8(commentsXML[grepl(target, commentsXML)]), collapse = "\n")
txt <- removeHeadTag(txt)

authors <- getNodes(xml = txt, tagIn = "<author>")
Expand Down
6 changes: 3 additions & 3 deletions R/readWorkbook.R
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,14 @@ read.xlsx.default <- function(xlsxFile,

## get workbook names
workbookRelsXML <-
paste(readLines(workbookRelsXML, warn = FALSE, encoding = "UTF-8"),
paste(readUTF8(workbookRelsXML),
collapse = ""
)
workbookRelsXML <-
getChildlessNode(xml = workbookRelsXML, tag = "<Relationship ")

workbook <-
unlist(readLines(workbook, warn = FALSE, encoding = "UTF-8"))
unlist(readUTF8(workbook))
workbook <- removeHeadTag(workbook)

sheets <-
Expand Down Expand Up @@ -499,7 +499,7 @@ read.xlsx.default <- function(xlsxFile,
}

stylesXML <- xmlFiles[grepl("styles.xml", xmlFiles)]
styles <- readLines(stylesXML, warn = FALSE)
styles <- readUTF8(stylesXML)
styles <- removeHeadTag(styles)

## Number formats
Expand Down
8 changes: 3 additions & 5 deletions R/wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -2716,8 +2716,6 @@ createNamedRegion <- function(wb, sheet, cols, rows, name) {

if (tolower(name) %in% ex_names) {
stop(sprintf("Named region with name '%s' already exists!", name))
} else if (grepl("[^A-Z0-9_\\.]", name[1], ignore.case = TRUE)) {
stop("Invalid characters in name")
} else if (grepl("^[A-Z]{1,3}[0-9]+$", name)) {
stop("name cannot look like a cell reference.")
}
Expand Down Expand Up @@ -2800,7 +2798,7 @@ getNamedRegions.default <- function(x) {
xmlFiles <- unzip(x, exdir = xmlDir)

workbook <- xmlFiles[grepl("workbook.xml$", xmlFiles, perl = TRUE)]
workbook <- unlist(readLines(workbook, warn = FALSE, encoding = "UTF-8"))
workbook <- unlist(readUTF8(workbook))

dn <- getChildlessNode(xml = removeHeadTag(workbook), tag = "<definedName ")
if (length(dn) == 0) {
Expand Down Expand Up @@ -3269,7 +3267,7 @@ getDateOrigin <- function(xlsxFile) {
on.exit(unlink(xmlDir, recursive = TRUE), add = TRUE)

workbook <- xmlFiles[grepl("workbook.xml$", xmlFiles, perl = TRUE)]
workbook <- paste(unlist(readLines(workbook, warn = FALSE)), collapse = "")
workbook <- paste(unlist(readUTF8(workbook)), collapse = "")

if (grepl('date1904="1"|date1904="true"', workbook, ignore.case = TRUE)) {
origin <- "1904-01-01"
Expand Down Expand Up @@ -3312,7 +3310,7 @@ getSheetNames <- function(file) {
on.exit(unlink(xmlDir, recursive = TRUE), add = TRUE)

workbook <- xmlFiles[grepl("workbook.xml$", xmlFiles, perl = TRUE)]
workbook <- readLines(workbook, warn = FALSE, encoding = "UTF-8")
workbook <- readUTF8(workbook)
workbook <- removeHeadTag(workbook)
sheets <- unlist(regmatches(workbook, gregexpr("(?<=<sheets>).*(?=</sheets>)", workbook, perl = TRUE)))
sheets <- unlist(regmatches(sheets, gregexpr("<sheet[^>]*>", sheets, perl = TRUE)))
Expand Down
2 changes: 0 additions & 2 deletions R/writeData.R
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,6 @@ writeData <- function(wb,

if (name %in% ex_names) {
stop(sprintf("Named region with name '%s' already exists!", name))
} else if (grepl("[^A-Z0-9_\\.]", name[1], ignore.case = TRUE)) {
stop("Invalid characters in name")
} else if (grepl("^[A-Z]{1,3}[0-9]+$", name)) {
stop("name cannot look like a cell reference.")
}
Expand Down
13 changes: 13 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// markUTF8
CharacterVector markUTF8(CharacterVector x, bool clone);
RcppExport SEXP _openxlsx_markUTF8(SEXP xSEXP, SEXP cloneSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type x(xSEXP);
Rcpp::traits::input_parameter< bool >::type clone(cloneSEXP);
rcpp_result_gen = Rcpp::wrap(markUTF8(x, clone));
return rcpp_result_gen;
END_RCPP
}
// loadworksheets
SEXP loadworksheets(Reference wb, List styleObjects, std::vector<std::string> xmlFiles, LogicalVector is_chart_sheet);
RcppExport SEXP _openxlsx_loadworksheets(SEXP wbSEXP, SEXP styleObjectsSEXP, SEXP xmlFilesSEXP, SEXP is_chart_sheetSEXP) {
Expand Down Expand Up @@ -456,6 +468,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_openxlsx_cppReadFile", (DL_FUNC) &_openxlsx_cppReadFile, 1},
{"_openxlsx_read_file_newline", (DL_FUNC) &_openxlsx_read_file_newline, 1},
{"_openxlsx_get_letters", (DL_FUNC) &_openxlsx_get_letters, 0},
{"_openxlsx_markUTF8", (DL_FUNC) &_openxlsx_markUTF8, 2},
{"_openxlsx_loadworksheets", (DL_FUNC) &_openxlsx_loadworksheets, 4},
{"_openxlsx_getNodes", (DL_FUNC) &_openxlsx_getNodes, 2},
{"_openxlsx_getOpenClosedNode", (DL_FUNC) &_openxlsx_getOpenClosedNode, 3},
Expand Down
16 changes: 16 additions & 0 deletions src/helper_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,19 @@ std::vector<std::string> get_letters(){
return(LETTERS);

}


// [[Rcpp::export]]
CharacterVector markUTF8(CharacterVector x, bool clone) {
CharacterVector out;
if (clone) {
out = Rcpp::clone(x);
} else {
out = x;
}
const size_t n = x.size();
for (size_t i = 0; i < n; ++i) {
out[i] = Rf_mkCharCE(x[i], CE_UTF8);
}
return out;
}
17 changes: 9 additions & 8 deletions src/load_workbook.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -778,8 +778,8 @@ SEXP getNodes(std::string xml, std::string tagIn){

}

return wrap(r) ;

CharacterVector out = wrap(r);
return markUTF8(out);
}


Expand Down Expand Up @@ -814,9 +814,8 @@ SEXP getOpenClosedNode(std::string xml, std::string open_tag, std::string close_

}

return wrap(r) ;


CharacterVector out = wrap(r);
return markUTF8(out);
}


Expand Down Expand Up @@ -852,7 +851,7 @@ SEXP getAttr(CharacterVector x, std::string tag){
}
}

return wrap(r) ;
return markUTF8(r); // no need to wrap as r is already a CharacterVector

}

Expand Down Expand Up @@ -912,7 +911,8 @@ CharacterVector getChildlessNode(std::string xml, std::string tag){

}

return wrap(r) ;
CharacterVector out = wrap(r);
return markUTF8(out);

}

Expand Down Expand Up @@ -960,7 +960,8 @@ CharacterVector get_extLst_Major(std::string xml){

}

return wrap(r) ;
CharacterVector out = wrap(r);
return markUTF8(out);

}

Expand Down
3 changes: 1 addition & 2 deletions src/openxlsx.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,5 +81,4 @@ LogicalVector isInternalHyperlink(CharacterVector x);
// helper functions
string itos(int i);
SEXP write_file(std::string parent, std::string xmlText, std::string parentEnd, std::string R_fileName);


CharacterVector markUTF8(CharacterVector x, bool clone = false);
5 changes: 3 additions & 2 deletions src/write_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ SEXP buildMatrixMixed(CharacterVector v,


// If column is date class and no strings exist in column
if( (std::find(dateCols.begin(), dateCols.end(), i) != dateCols.end()) &
if( (std::find(dateCols.begin(), dateCols.end(), i) != dateCols.end()) &&
(std::find(charCols.begin(), charCols.end(), i) == charCols.end()) ){

// these are all dates and no characters --> safe to convert numerics
Expand Down Expand Up @@ -342,7 +342,8 @@ CharacterVector build_table_xml(std::string table, std::string tableStyleXML, st
table = table + tableCols + tableStyleXML + "</table>";


return wrap(table);
CharacterVector out = wrap(table);
return markUTF8(out);

}

Expand Down
Loading

0 comments on commit 4dfbf55

Please sign in to comment.