Skip to content

Commit

Permalink
Merge pull request #8 from Boehringer-Ingelheim/rc/2.1.0
Browse files Browse the repository at this point in the history
Rc/2.1.0 to main
  • Loading branch information
mingstat authored Nov 28, 2024
2 parents f724639 + e0529ff commit 5c93964
Show file tree
Hide file tree
Showing 12 changed files with 314 additions and 77 deletions.
9 changes: 6 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
Package: dv.loader
Type: Package
Title: Data loading module
Version: 2.0.0
Version: 2.1.0
Authors@R: c(
person( "Boehringer-Ingelheim Pharma GmbH & Co.KG", role = c("cph", "fnd")),
person( given = "Ming", family = "Yang", role = c("aut", "cre"), email = "[email protected]"),
person( given = "Steven", family = "Brooks", role = "aut", email = "[email protected]"),
person( given = "Sorin", family = "Voicu", role = "aut", email = "[email protected]")
)
Description: This is a module for loading .RDS / .sas7bdat data files from a network file storage environment. It also allows loading data locally.
Description: A package for loading multiple data files, returning a list of data frames with associated metadata, designed to integrate with the modular DaVinci framework.
License: Apache License (>= 2)
Encoding: UTF-8
LazyData: true
Depends: R (>= 3.5.0)
Imports: haven
Imports:
haven,
checkmate
Suggests:
testthat,
knitr,
rmarkdown
RoxygenNote: 7.3.0
VignetteBuilder: knitr
Config/testthat/edition: 3
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
export(get_cre_path)
export(get_nfs_path)
export(load_data)
export(load_files)
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# dv.loader 2.1.0

- Added `load_files()` to load data using explicit file paths.

# dv.loader 2.0.0

- GitHub release with QC report
Expand Down
38 changes: 38 additions & 0 deletions R/dvloader.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,41 @@ load_data <- function(sub_dir = NULL, file_names, use_wd = FALSE, prefer_sas = F

return(data_list)
}

#' Load data files from explicit paths
#'
#' Read data from provided paths and return it as a list of data frames.
#' Supports both .rds and .sas7bdat formats (extension matched case-insensitively).
#'
#' The names of the returned list are resolved and checked for duplicates
#' *before* any file is read, so a naming conflict is reported without first
#' paying the cost of loading every file.
#'
#' @param file_paths [character(1+)] Files to read. Optionally named. Missing
#'   values are not allowed.
#'
#' @return [list] A named list of data frames, where each name is either:
#' - the name associated to the element in the `file_paths` argument, or, if not provided...
#' - the name of the file itself, after stripping it of its leading path and trailing extension
#'
#' Each data frame carries a "meta" attribute attached by
#' `read_file_and_attach_metadata()`.
#'
#' An error is raised if `file_paths` is not a non-empty character vector, if
#' any file does not exist / is not readable / has an unsupported extension,
#' or if two entries resolve to the same list name.
#'
#' @export
load_files <- function(file_paths) {
  checkmate::assert_character(file_paths, min.len = 1, any.missing = FALSE)
  checkmate::assert_file_exists(
    file_paths,
    access = "r",
    extension = c("rds", "sas7bdat")
  )

  # Use names provided as arguments
  data_names <- names(file_paths)
  if (is.null(data_names)) data_names <- rep("", length(file_paths))

  # If names are not provided (empty or NA), fall back to file names without
  # leading path or trailing extension
  unnamed <- is.na(data_names) | !nzchar(data_names)
  data_names[unnamed] <- tools::file_path_sans_ext(basename(file_paths[unnamed]))

  # Fail fast: detect clashes before reading anything. `unique()` keeps each
  # offending name from being listed more than once in the message.
  dup_names <- unique(data_names[duplicated(data_names)])
  if (length(dup_names) > 0) {
    stop(sprintf(
      "Duplicate entries detected (%s). Please review `file_paths` argument.",
      paste(dup_names, collapse = ", ")
    ))
  }

  data_list <- lapply(file_paths, read_file_and_attach_metadata)
  names(data_list) <- data_names

  return(data_list)
}
53 changes: 25 additions & 28 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ create_data_list <- function(file_path, file_names, prefer_sas) {
stop(paste("create_data_list(): No RDS or SAS files found for", file_path, x))
}

output <- read_file(file_path, file_name_to_load)

output <- read_file_and_attach_metadata(file.path(file_path, file_name_to_load))
return(output)
})

Expand All @@ -47,35 +46,33 @@ create_data_list <- function(file_path, file_names, prefer_sas) {
}


#' Reads RDS/SAS file and metadatas from first 6 items from file.info() its file path
#' @param file_path a path to a file
#' @param file_name name of a file
#' @return a data object with an extra attribute of metadata
read_file <- function(file_path, file_name) {
ext <- tools::file_ext(file_name)

if (!(toupper(ext) %in% c("RDS", "SAS7BDAT"))) {
stop("Usage error: read_file: file_name: file must either be RDS or SAS7BDAT.")
#' Read a data file and attach metadata
#'
#' Reads an .rds or .sas7bdat file from the given path and attaches metadata about the file
#' as an attribute.
#'
#' @param path [character(1)] Path to the data file to read
#'
#' @return A data frame with metadata attached as an attribute named "meta".
#'
#' @keywords internal
read_file_and_attach_metadata <- function(path) {
extension <- tools::file_ext(path)

if (toupper(extension) == "RDS") {
data <- readRDS(path)
} else if (toupper(extension) == "SAS7BDAT") {
data <- haven::read_sas(path)
} else {
stop("Not supported file type, only .rds or .sas7bdat files can be loaded.")
}

is_rds <- toupper(ext) == "RDS"

file <- file.path(file_path, file_name)
file_name <- tools::file_path_sans_ext(file_name)

# grab file info
meta <- file.info(file)[1L:6L]
meta[["path"]] <- row.names(meta)
meta[["file_name"]] <- file_name
meta <- data.frame(meta, stringsAsFactors = FALSE)
meta <- file.info(path, extra_cols = FALSE)
meta[["path"]] <- path
meta[["file_name"]] <- basename(path)
row.names(meta) <- NULL

if (is_rds) {
out <- readRDS(file)
} else {
out <- haven::read_sas(file)
}
attr(out, "meta") <- meta
attr(data, "meta") <- meta

return(out)
return(data)
}
68 changes: 54 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,30 +1,70 @@
# Data Loading

The {dv.loader} package provides a simple interface for loading data from a network file storage folder or
locally. It is designed to be used with `.RDS` and `.sas7bdat` file formats.
The package provides a simple function, `load_data()`, which loads R and SAS data files into memory.
Loading data from SQL databases is not yet supported. The function returns a list named by the file names passed,
and containing data frames, along with metadata for that table. By default, the function will look for files in a
sub-directory `sub_dir` of the base path defined by an environment variable "RXD_DATA". You can check if the base path
is set by running `Sys.getenv("RXD_DATA")`. A single file or multiple files can be loaded at once.
To make the loading process faster for large datasets, it is suggested that '.sas7bdat' files are converted to
'.RDS' files. The function will prefer '.RDS' files over '.sas7bdat' files by default.
The `dv.loader` package provides two functions for loading `.rds` and `.sas7bdat` files into R.

- `load_data()`: loads data files from a specified subdirectory of the base path defined by the environment variable "RXD_DATA". This function is useful when working with data files stored in a centralized location.
- `load_files()`: accepts explicit file paths to load data files from any location on your system. You can optionally provide custom names for the data frames in the returned list.

## Installation

The `dv.loader` package is available on GitHub. To install it, you can use the following commands:

```r
if (!require("remotes")) install.packages("remotes")
remotes::install_github("Boehringer-Ingelheim/dv.loader")
```

## Basic usage
After installation, you can load the package using:

```r
# getting data from a network file storage folder
dv.loader::load_data(sub_dir = "subdir1/subdir2", file_names = c("adsl", "adae"))
library(dv.loader)
```

## Basic Usage

### Using `load_data()`

The `load_data()` function loads data from the specified subdirectory relative to `RXD_DATA`. For the `file_names` argument, you can optionally specify the file extensions in the names. If not provided, the function will attempt to search for `.rds` and `.sas7bdat` files in the subdirectory and decide which one to load based on the `prefer_sas` argument when both file types are present. By default, `prefer_sas` is `FALSE`, meaning `.rds` files are preferred due to their smaller file size and faster loading time.

```r
# getting data locally (e.g., if you have file `./data/adsl.RDS`)
dv.loader::load_data(sub_dir = "data", file_names = c("adsl"), use_wd = TRUE)
# Set the RXD_DATA environment variable
Sys.setenv(RXD_DATA = "path/to/data/folder")

# Load data from path/to/data/folder/subdir1
load_data(
sub_dir = "subdir1",
file_names = c("file1", "file2"),
prefer_sas = TRUE
)

# Load data from path/to/data/folder/subdir1/subdir2
load_data(
sub_dir = "subdir1/subdir2",
file_names = c("file1.rds", "file2.sas7bdat"),
)
```

### Using `load_files()`

The `load_files()` function requires you to provide explicit file paths including the file extensions for the data files you want to load. You can optionally provide custom names for the data frames in the returned list.


```r
# Load data files with default names
load_files(
file_paths = c(
"path/to/file1.rds",
"path/to/file2.sas7bdat"
)
)

# Load data files with custom names
load_files(
file_paths = c(
"file1 (rds)" = "path/to/file1.rds",
"file2 (sas)" = "path/to/file2.sas7bdat"
)
)
```

For more details, please refer to the package vignettes and function documentation.
2 changes: 1 addition & 1 deletion man/load_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions man/load_files.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 0 additions & 19 deletions man/read_file.Rd

This file was deleted.

19 changes: 19 additions & 0 deletions man/read_file_and_attach_metadata.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

90 changes: 90 additions & 0 deletions tests/testthat/test-load_files.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
test_that("load_files() correctly loads both RDS and SAS files", {
  path_rds <- "inst/extdata/dummyads1.RDS"
  path_sas <- "inst/extdata/dummyads2.sas7bdat"

  # Builds the metadata data frame that is expected to be attached to a file
  # loaded from `path`: file.info() columns plus `path` and `file_name`,
  # with row names dropped.
  build_expected_meta <- function(path) {
    expected <- cbind(
      file.info(path, extra_cols = FALSE),
      path = path,
      file_name = basename(path)
    )
    row.names(expected) <- NULL
    expected
  }

  loaded <- load_files(c(path_rds, path_sas))

  # Unnamed inputs get names derived from the file names
  expect_equal(names(loaded), c("dummyads1", "dummyads2"))

  # Contents must match what the underlying readers return (metadata aside)
  expect_equal(
    loaded[["dummyads1"]],
    readRDS(path_rds),
    ignore_attr = "meta"
  )
  expect_equal(
    loaded[["dummyads2"]],
    haven::read_sas(path_sas),
    ignore_attr = "meta"
  )

  # Reference metadata, computed independently of load_files()
  meta_rds <- build_expected_meta(path_rds)
  meta_sas <- build_expected_meta(path_sas)

  # The "meta" attribute of each element must equal the reference
  expect_equal(attr(loaded[["dummyads1"]], "meta"), meta_rds)
  expect_equal(attr(loaded[["dummyads2"]], "meta"), meta_sas)
})

test_that("load_files() handles paths whose extension case differs from the file on disk", {
  # The fixtures are stored as `dummyads1.RDS` and `dummyads2.sas7bdat`; the
  # paths below deliberately flip the case of each extension.
  file_paths <- c(
    "inst/extdata/dummyads1.rds", # on disk: .RDS
    "inst/extdata/dummyads2.SAS7BDAT" # on disk: .sas7bdat
  )

  if (all(file.exists(file_paths))) {
    # The file system resolved the differently-cased paths, so loading is
    # expected to succeed and the default names come from the given paths.
    data_list <- load_files(file_paths = file_paths)
    expect_equal(names(data_list), c("dummyads1", "dummyads2"))
  } else {
    # The paths do not resolve (as observed on GitHub Actions), so the
    # file-existence assertion must fail. Pin the message so an unrelated
    # error cannot make this test pass.
    expect_error(
      load_files(file_paths = file_paths),
      "File does not exist"
    )
  }
})

test_that("load_files() properly validates file extensions", {
  # Neither extension is one of the supported types (.rds / .sas7bdat)
  unsupported_paths <- c(
    "inst/extdata/bad_file_type.myrds",
    "inst/extdata/bad_file_type.txt"
  )

  expect_error(load_files(file_paths = unsupported_paths))
})

test_that("load_files() can return both default and custom names for loaded data", {
  rds_ads1 <- "inst/extdata/just_rds/dummyads1.RDS"
  sas_ads1 <- "inst/extdata/just_sas/dummyads1.sas7bdat"
  sas_ads2 <- "inst/extdata/just_sas/dummyads2.sas7bdat"

  # Two unnamed files sharing a base name must be rejected as duplicates
  expect_error(
    load_files(file_paths = c(rds_ads1, sas_ads1)),
    "Duplicate entries detected \\(dummyads1\\). Please review `file_paths` argument."
  )

  # No names supplied: list names are derived from the file names
  default_named <- load_files(file_paths = c(rds_ads1, sas_ads2))
  expect_equal(names(default_named), c("dummyads1", "dummyads2"))

  # All names supplied: list names are taken from the argument
  custom_named <- load_files(
    file_paths = c(
      "rds_dummyads1" = rds_ads1,
      "sas_dummyads2" = sas_ads2
    )
  )
  expect_equal(names(custom_named), c("rds_dummyads1", "sas_dummyads2"))

  # Partially named: supplied name is kept, the other falls back to file name
  mixed_named <- load_files(
    file_paths = c(
      "rds_dummyads1" = rds_ads1,
      "inst/extdata/dummyads2.sas7bdat"
    )
  )
  expect_equal(names(mixed_named), c("rds_dummyads1", "dummyads2"))
})
Loading

0 comments on commit 5c93964

Please sign in to comment.