Skip to content

Commit

Permalink
enable import of specific table from html (closes #126)
Browse files Browse the repository at this point in the history
  • Loading branch information
leeper committed Sep 25, 2016
1 parent 6e677d4 commit aa5c0fc
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 9 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: rio
Type: Package
Title: A Swiss-Army Knife for Data I/O
Version: 0.4.15
Date: 2016-09-19
Version: 0.4.16
Date: 2016-09-25
Authors@R: c(person("Jason", "Becker", role = "ctb", email = "[email protected]"),
person("Chung-hong", "Chan", role = "aut", email = "[email protected]"),
person("Geoffrey CH", "Chan", role = "ctb", email = "[email protected]"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -122,5 +122,6 @@ importFrom(xml2,write_xml)
importFrom(xml2,xml_add_child)
importFrom(xml2,xml_add_sibling)
importFrom(xml2,xml_children)
importFrom(xml2,xml_find_all)
importFrom(yaml,as.yaml)
importFrom(yaml,yaml.load)
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# CHANGES TO v0.4.16 #

* Added support for importing from multi-table HTML files using the `which` argument. (#126)

# CHANGES TO v0.4.15 #

* Improved behavior of `import()` and `export()` with respect to unrecognized file types. (#124, #125, h/t Jason Becker)
Expand Down
4 changes: 2 additions & 2 deletions R/import.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#' @param file A character string naming a file, URL, or single-file .zip or .tar archive.
#' @param format An optional character string code of file format, which can be used to override the format inferred from \code{file}. Shortcuts include: \dQuote{,} (for comma-separated values), \dQuote{;} (for semicolon-separated values), and \dQuote{|} (for pipe-separated values).
#' @param setclass An optional character vector specifying one or more classes to set on the import. By default, the returned object is always a \dQuote{data.frame}. Reasonable values for this might be \dQuote{tbl_df} (if using dplyr) or \dQuote{data.table} (if using data.table). Warnings will be produced if a class is used from a package that is not loaded and/or available.
#' @param which This argument is used to control import from multi-object files. If \code{file} is a compressed directory, \code{which} can be either a character string specifying a filename or an integer specifying which file (in locale sort order) to extract from the compressed directory. For Excel spreadsheets, this can be used to specify a sheet number. For .Rdata files, this can be an object name. Ignored otherwise. A character string value will be used as a regular expression, such that the extracted file is the first match of the regular expression against the file names in the archive.
#' @param which This argument is used to control import from multi-object files. If \code{file} is a compressed directory, \code{which} can be either a character string specifying a filename or an integer specifying which file (in locale sort order) to extract from the compressed directory. For Excel spreadsheets, this can be used to specify a sheet number. For .Rdata files, this can be an object name. For HTML files, this can be an integer specifying which table to extract (in document order). Ignored otherwise. A character string value will be used as a regular expression, such that the extracted file is the first match of the regular expression against the file names in the archive.
#' @param \dots Additional arguments passed to the underlying import functions. For example, this can control column classes for delimited file types, or control the use of haven for Stata and SPSS or readxl for Excel (.xlsx) format. See details below.
#' @return An R data.frame. If \code{setclass} is used, this data.frame may have additional class attribute values.
#' @details This function imports a data frame or matrix from a data file with the file format based on the file extension (or the manually specified format, if \code{format} is specified).
Expand Down Expand Up @@ -102,7 +102,7 @@ import <- function(file, format, setclass, which, ...) {
if (missing(format)) {
fmt <- get_ext(file)
if (fmt %in% c("gz", "gzip")) {
fmt <- file_ext(file_path_sans_ext(file, compress = FALSE))
fmt <- file_ext(file_path_sans_ext(file, compression = FALSE))
file <- gzfile(file, "r")
on.exit(close(file))
}
Expand Down
8 changes: 6 additions & 2 deletions R/import_methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -327,10 +327,14 @@ function(file, which = 1, fread = TRUE, sep = "auto", sep2 = "auto",
d
}

#' @importFrom xml2 read_html as_list
#' @importFrom xml2 read_html as_list xml_find_all
#' @export
.import.rio_html <- function(file, which = 1, stringsAsFactors = FALSE, ...) {
x <- as_list(read_html(unclass(file)))[["body"]][["table"]]
tables <- xml_find_all(read_html(unclass(file)), ".//table")
if (which > length(tables)) {
stop(paste0("Requested table exceeds number of tables found in file (", length(tables),")!"))
}
x <- as_list(tables[[which]])
if ("th" %in% names(x[[1]])) {
col_names <- unlist(x[[1]][names(x[[1]]) %in% "th"])
out <- do.call("rbind", lapply(x[-1], function(y) {
Expand Down
Loading

0 comments on commit aa5c0fc

Please sign in to comment.