From aa5c0fc386fd750a5d14a45dcff5406b9a052b82 Mon Sep 17 00:00:00 2001 From: "Thomas J. Leeper" Date: Sun, 25 Sep 2016 15:25:26 +0100 Subject: [PATCH] enable import of specific table from html (closes #126) --- DESCRIPTION | 4 ++-- NAMESPACE | 1 + NEWS.md | 4 ++++ R/import.R | 4 ++-- R/import_methods.R | 8 ++++++-- inst/examples/twotables.html | 12 ++++++++++++ man/import.Rd | 2 +- tests/testthat/test_format_html.R | 7 +++++-- 8 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 inst/examples/twotables.html diff --git a/DESCRIPTION b/DESCRIPTION index 525a9f2..e93bef8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: rio Type: Package Title: A Swiss-Army Knife for Data I/O -Version: 0.4.15 -Date: 2016-09-19 +Version: 0.4.16 +Date: 2016-09-25 Authors@R: c(person("Jason", "Becker", role = "ctb", email = "jason@jbecker.co"), person("Chung-hong", "Chan", role = "aut", email = "chainsawtiney@gmail.com"), person("Geoffrey CH", "Chan", role = "ctb", email = "gefchchan@gmail.com"), diff --git a/NAMESPACE b/NAMESPACE index bdd16a2..ffbddad 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,5 +122,6 @@ importFrom(xml2,write_xml) importFrom(xml2,xml_add_child) importFrom(xml2,xml_add_sibling) importFrom(xml2,xml_children) +importFrom(xml2,xml_find_all) importFrom(yaml,as.yaml) importFrom(yaml,yaml.load) diff --git a/NEWS.md b/NEWS.md index 252d120..297460e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# CHANGES TO v0.4.16 # + + * Added support for importing from multi-table HTML files using the `which` argument. (#126) + # CHANGES TO v0.4.15 # * Improved behavior of `import()` and `export()` with respect to unrecognized file types. (#124, #125, h/t Jason Becker) diff --git a/R/import.R b/R/import.R index c8a38e7..d270cea 100644 --- a/R/import.R +++ b/R/import.R @@ -4,7 +4,7 @@ #' @param file A character string naming a file, URL, or single-file .zip or .tar archive. 
#' @param format An optional character string code of file format, which can be used to override the format inferred from \code{file}. Shortcuts include: \dQuote{,} (for comma-separated values), \dQuote{;} (for semicolon-separated values), and \dQuote{|} (for pipe-separated values). #' @param setclass An optional character vector specifying one or more classes to set on the import. By default, all the return object is always a \dQuote{data.frame}. Reasonable values for this might be \dQuote{tbl_df} (if using dplyr) or \dQuote{data.table} (if using data.table). Warnings will be produced if a class is used from a package that is not loaded and/or available. -#' @param which This argument is used to control import from multi-object files. If \code{file} is a compressed directory, \code{which} can be either a character string specifying a filename or an integer specifying which file (in locale sort order) to extract from the compressed directory. For Excel spreadsheets, this can be used to specify a sheet number. For .Rdata files, this can be an object name. Ignored otherwise. A character string value will be used as a regular expression, such that the extracted file is the first match of the regular expression against the file names in the archive. +#' @param which This argument is used to control import from multi-object files. If \code{file} is a compressed directory, \code{which} can be either a character string specifying a filename or an integer specifying which file (in locale sort order) to extract from the compressed directory. For Excel spreadsheets, this can be used to specify a sheet number. For .Rdata files, this can be an object name. For HTML files, which table to extract (from document order). Ignored otherwise. A character string value will be used as a regular expression, such that the extracted file is the first match of the regular expression against the file names in the archive. 
#' @param \dots Additional arguments passed to the underlying import functions. For example, this can control column classes for delimited file types, or control the use of haven for Stata and SPSS or readxl for Excel (.xlsx) format. See details below. #' @return An R data.frame. If \code{setclass} is used, this data.frame may have additional class attribute values. #' @details This function imports a data frame or matrix from a data file with the file format based on the file extension (or the manually specified format, if \code{format} is specified). @@ -102,7 +102,7 @@ import <- function(file, format, setclass, which, ...) { if (missing(format)) { fmt <- get_ext(file) if (fmt %in% c("gz", "gzip")) { - fmt <- file_ext(file_path_sans_ext(file, compress = FALSE)) + fmt <- file_ext(file_path_sans_ext(file, compression = FALSE)) file <- gzfile(file, "r") on.exit(close(file)) } diff --git a/R/import_methods.R b/R/import_methods.R index 0ed6846..f038c08 100644 --- a/R/import_methods.R +++ b/R/import_methods.R @@ -327,10 +327,14 @@ function(file, which = 1, fread = TRUE, sep = "auto", sep2 = "auto", d } -#' @importFrom xml2 read_html as_list +#' @importFrom xml2 read_html as_list xml_find_all #' @export .import.rio_html <- function(file, which = 1, stringsAsFactors = FALSE, ...) { - x <- as_list(read_html(unclass(file)))[["body"]][["table"]] + tables <- xml_find_all(read_html(unclass(file)), ".//table") + if (which > length(tables)) { + stop(paste0("Requested table exceeds number of tables found in file (", length(tables),")!")) + } + x <- as_list(tables[[which]]) if ("th" %in% names(x[[1]])) { col_names <- unlist(x[[1]][names(x[[1]]) %in% "th"]) out <- do.call("rbind", lapply(x[-1], function(y) { diff --git a/inst/examples/twotables.html b/inst/examples/twotables.html new file mode 100644 index 0000000..d608c98 --- /dev/null +++ b/inst/examples/twotables.html @@ -0,0 +1,12 @@ + + + + + R Exported Data + + +
mpgcyldisphpdratwtqsecvsamgearcarb
2161601103.92.6216.460144
2161601103.92.87517.020144
22.84108933.852.3218.611141
21.462581103.083.21519.441031
18.783601753.153.4417.020032
18.162251052.763.4620.221031
14.383602453.213.5715.840034
24.44146.7623.693.19201042
22.84140.8953.923.1522.91042
19.26167.61233.923.4418.31044
17.86167.61233.923.4418.91044
16.48275.81803.074.0717.40033
17.38275.81803.073.7317.60033
15.28275.81803.073.78180033
10.484722052.935.2517.980034
10.4846021535.42417.820034
14.784402303.235.34517.420034
32.4478.7664.082.219.471141
30.4475.7524.931.61518.521142
33.9471.1654.221.83519.91141
21.54120.1973.72.46520.011031
15.583181502.763.5216.870032
15.283041503.153.43517.30032
13.383502453.733.8415.410034
19.284001753.083.84517.050032
27.3479664.081.93518.91141
264120.3914.432.1416.70152
30.4495.11133.771.51316.91152
15.883512644.223.1714.50154
19.761451753.622.7715.50156
1583013353.543.5714.60158
21.441211094.112.7818.61142
+

+
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
5.13.51.40.2setosa
4.931.40.2setosa
4.73.21.30.2setosa
4.63.11.50.2setosa
53.61.40.2setosa
5.43.91.70.4setosa
4.63.41.40.3setosa
53.41.50.2setosa
4.42.91.40.2setosa
4.93.11.50.1setosa
5.43.71.50.2setosa
4.83.41.60.2setosa
4.831.40.1setosa
4.331.10.1setosa
5.841.20.2setosa
5.74.41.50.4setosa
5.43.91.30.4setosa
5.13.51.40.3setosa
5.73.81.70.3setosa
5.13.81.50.3setosa
5.43.41.70.2setosa
5.13.71.50.4setosa
4.63.610.2setosa
5.13.31.70.5setosa
4.83.41.90.2setosa
531.60.2setosa
53.41.60.4setosa
5.23.51.50.2setosa
5.23.41.40.2setosa
4.73.21.60.2setosa
4.83.11.60.2setosa
5.43.41.50.4setosa
5.24.11.50.1setosa
5.54.21.40.2setosa
4.93.11.50.2setosa
53.21.20.2setosa
5.53.51.30.2setosa
4.93.61.40.1setosa
4.431.30.2setosa
5.13.41.50.2setosa
53.51.30.3setosa
4.52.31.30.3setosa
4.43.21.30.2setosa
53.51.60.6setosa
5.13.81.90.4setosa
4.831.40.3setosa
5.13.81.60.2setosa
4.63.21.40.2setosa
5.33.71.50.2setosa
53.31.40.2setosa
73.24.71.4versicolor
6.43.24.51.5versicolor
6.93.14.91.5versicolor
5.52.341.3versicolor
6.52.84.61.5versicolor
5.72.84.51.3versicolor
6.33.34.71.6versicolor
4.92.43.31versicolor
6.62.94.61.3versicolor
5.22.73.91.4versicolor
523.51versicolor
5.934.21.5versicolor
62.241versicolor
6.12.94.71.4versicolor
5.62.93.61.3versicolor
6.73.14.41.4versicolor
5.634.51.5versicolor
5.82.74.11versicolor
6.22.24.51.5versicolor
5.62.53.91.1versicolor
5.93.24.81.8versicolor
6.12.841.3versicolor
6.32.54.91.5versicolor
6.12.84.71.2versicolor
6.42.94.31.3versicolor
6.634.41.4versicolor
6.82.84.81.4versicolor
6.7351.7versicolor
62.94.51.5versicolor
5.72.63.51versicolor
5.52.43.81.1versicolor
5.52.43.71versicolor
5.82.73.91.2versicolor
62.75.11.6versicolor
5.434.51.5versicolor
63.44.51.6versicolor
6.73.14.71.5versicolor
6.32.34.41.3versicolor
5.634.11.3versicolor
5.52.541.3versicolor
5.52.64.41.2versicolor
6.134.61.4versicolor
5.82.641.2versicolor
52.33.31versicolor
5.62.74.21.3versicolor
5.734.21.2versicolor
5.72.94.21.3versicolor
6.22.94.31.3versicolor
5.12.531.1versicolor
5.72.84.11.3versicolor
6.33.362.5virginica
5.82.75.11.9virginica
7.135.92.1virginica
6.32.95.61.8virginica
6.535.82.2virginica
7.636.62.1virginica
4.92.54.51.7virginica
7.32.96.31.8virginica
6.72.55.81.8virginica
7.23.66.12.5virginica
6.53.25.12virginica
6.42.75.31.9virginica
6.835.52.1virginica
5.72.552virginica
5.82.85.12.4virginica
6.43.25.32.3virginica
6.535.51.8virginica
7.73.86.72.2virginica
7.72.66.92.3virginica
62.251.5virginica
6.93.25.72.3virginica
5.62.84.92virginica
7.72.86.72virginica
6.32.74.91.8virginica
6.73.35.72.1virginica
7.23.261.8virginica
6.22.84.81.8virginica
6.134.91.8virginica
6.42.85.62.1virginica
7.235.81.6virginica
7.42.86.11.9virginica
7.93.86.42virginica
6.42.85.62.2virginica
6.32.85.11.5virginica
6.12.65.61.4virginica
7.736.12.3virginica
6.33.45.62.4virginica
6.43.15.51.8virginica
634.81.8virginica
6.93.15.42.1virginica
6.73.15.62.4virginica
6.93.15.12.3virginica
5.82.75.11.9virginica
6.83.25.92.3virginica
6.73.35.72.5virginica
6.735.22.3virginica
6.32.551.9virginica
6.535.22virginica
6.23.45.42.3virginica
5.935.11.8virginica
+ + diff --git a/man/import.Rd b/man/import.Rd index fcb3e39..747603a 100644 --- a/man/import.Rd +++ b/man/import.Rd @@ -13,7 +13,7 @@ import(file, format, setclass, which, ...) \item{setclass}{An optional character vector specifying one or more classes to set on the import. By default, all the return object is always a \dQuote{data.frame}. Reasonable values for this might be \dQuote{tbl_df} (if using dplyr) or \dQuote{data.table} (if using data.table). Warnings will be produced if a class is used from a package that is not loaded and/or available.} -\item{which}{This argument is used to control import from multi-object files. If \code{file} is a compressed directory, \code{which} can be either a character string specifying a filename or an integer specifying which file (in locale sort order) to extract from the compressed directory. For Excel spreadsheets, this can be used to specify a sheet number. For .Rdata files, this can be an object name. Ignored otherwise. A character string value will be used as a regular expression, such that the extracted file is the first match of the regular expression against the file names in the archive.} +\item{which}{This argument is used to control import from multi-object files. If \code{file} is a compressed directory, \code{which} can be either a character string specifying a filename or an integer specifying which file (in locale sort order) to extract from the compressed directory. For Excel spreadsheets, this can be used to specify a sheet number. For .Rdata files, this can be an object name. For HTML files, which table to extract (from document order). Ignored otherwise. A character string value will be used as a regular expression, such that the extracted file is the first match of the regular expression against the file names in the archive.} \item{\dots}{Additional arguments passed to the underlying import functions. 
For example, this can control column classes for delimited file types, or control the use of haven for Stata and SPSS or readxl for Excel (.xlsx) format. See details below.} } diff --git a/tests/testthat/test_format_html.R b/tests/testthat/test_format_html.R index 7cf989d..79aba92 100644 --- a/tests/testthat/test_format_html.R +++ b/tests/testthat/test_format_html.R @@ -2,11 +2,14 @@ context("HTML imports/exports") require("datasets") test_that("Export to HTML", { - expect_true(export(iris, "iris.html") %in% dir()) + expect_true(export(iris, "iris.html") %in% dir(), label = "export to html works") }) test_that("Import from HTML", { - expect_true(is.data.frame(import("iris.html"))) + expect_true(is.data.frame(import("iris.html")), label = "import from single-table html works") + f <- system.file("examples", "twotables.html", package = "rio") + expect_true(all(dim(import(f, which = 1)) == c(32, 11)), label = "import from two-table html works (which = 1)") + expect_true(all(dim(import(f, which = 2)) == c(150, 5)), label = "import from two-table html works (which = 2)") }) unlink("iris.html")