forked from bokov/Fall-2021-TSCI5230-Example
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.R
142 lines (134 loc) · 5.83 KB
/
data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#' ---
#' title: "[TRIPOD 1] Identify the study as developing and/or validating a multivariable prediction model, the target population, and the outcome to be predicted."
#' author: 'Author One ^1,✉^, Author Two ^1^'
#' abstract: |
#'   | [TRIPOD 2] Provide a summary of objectives, study design, setting, participants, sample size, predictors, outcome, statistical analysis, results, and conclusions.
#' documentclass: article
#' description: 'Manuscript'
#' clean: false
#' self_contained: true
#' number_sections: false
#' keep_md: true
#' fig_caption: true
#' output:
#'   html_document:
#'     code_folding: hide
#'     toc: true
#'     toc_float: true
#' ---
#'
#+ init, echo=FALSE, message=FALSE, warning=FALSE,results='hide'
# Verbosity switch for the report: code is echoed when debug > -1, while
# warnings and messages are shown only when debug > 0
debug <- 0
knitr::opts_chunk$set(
  echo = debug > -1,
  warning = debug > 0,
  message = debug > 0
)
#' Load libraries
library(GGally);
library(rio);
library(dplyr);
library(pander);
#library(synthpop);
#' Here are the libraries R currently sees:
# Print the current search path as a formatted list
pander(search())
#' Load data
# Default input file; the element name `dat0` is the key used to look the
# path up when importing below
inputdata <- c(dat0 = 'data/sim_veteran.xlsx')
# If a local.config.R file exists in the working directory, source it into
# the current environment before the data is read
# NOTE(review): presumably this lets a user override `inputdata` -- confirm
if (file.exists('local.config.R')) {
  source('local.config.R', local = TRUE, echo = FALSE)
}
dat0 <- import(inputdata['dat0'])
#' Make a scatterplot matrix
# Scatterplot matrix of the freshly imported data
ggpairs(data = dat0)
#' # Using `pander`
#'
#' The `pander()` command helps show tabular (and other) data in a more readable
#' manner. It's optional if you're working in the R console, but if you are also
#' preparing reports from those same scripts (which we are in this class) then
#' it makes a big difference. In most of the examples here we will use `pander`
#' with the following syntax:
#+ panderexample, eval=FALSE
# This chunk is displayed but not run (eval=FALSE); it only illustrates the
# two calling conventions used throughout the rest of the script.
# split.tables=Inf and split.cells=Inf ask pander not to break wide tables
# or long cell contents (see ?pander::pandoc.table).
#
# Standalone: the data frame is passed directly as the first argument
pander(dat0,split.tables=Inf, split.cells=Inf);
# As part of a pipeline of data transformation commands: the result of the
# preceding step is piped in as the first argument
head(dat0) %>% pander(split.tables=Inf, split.cells=Inf);
#' # Using `dplyr` to transform data
#'
#' ## Row Operations
#'
#' ### `filter`
#'
#' `filter()` selects rows that meet a set of criteria you specify.
#+ filter01
# Keep only the rows where `time` exceeds 70 and `celltype` is 'adeno'
dat0 %>%
  filter(time > 70 & celltype == 'adeno') %>%
  pander(split.tables = Inf, split.cells = Inf)
#' ### `slice`
#'
#' The following three statements all select the same rows (2 through 6):
#+ slice01
# Show rows 2 through 6 using dplyr's slice()
slice(dat0, 2:6) %>% pander(split.tables=Inf, split.cells=Inf);
# The same rows using base R bracket indexing
dat0[2:6,] %>% pander(split.tables=Inf, split.cells=Inf);
# The same rows with slice() used as a step in a pipeline
dat0 %>% slice(2:6) %>% pander(split.tables=Inf, split.cells=Inf);
#' Show the top 6 or the bottom 6 rows, respectively. These are equivalent to
#' base R's `head()` and `tail()` commands when used with the `n` argument.
#+ slice02
# First six rows
slice_head(dat0, n = 6) %>%
  pander(split.tables = Inf, split.cells = Inf)
# Last six rows
slice_tail(dat0, n = 6) %>%
  pander(split.tables = Inf, split.cells = Inf)
#' The `slice` family of functions can take a `prop` argument instead of an `n`.
#' This specifies a proportion of the total rows rather than a fixed number.
#' For example, the top and bottom 10% of the data:
#+ slice03
# Leading 10% of the rows
slice_head(dat0, prop = 0.1) %>%
  pander(split.tables = Inf, split.cells = Inf)
# Trailing 10% of the rows
slice_tail(dat0, prop = 0.1) %>%
  pander(split.tables = Inf, split.cells = Inf)
#' A _very_ useful function is `slice_sample`. It selects a random sample of `n`
#' or `prop` rows from your data.
#+ slice04
# Draw five rows at random, with replacement
dat0 %>%
  slice_sample(n = 5, replace = TRUE) %>%
  pander(split.tables = Inf, split.cells = Inf)
# The same call again; this is a separate random draw
dat0 %>%
  slice_sample(n = 5, replace = TRUE) %>%
  pander(split.tables = Inf, split.cells = Inf)
#' ### `arrange`
#'
#' `arrange` returns the same data frame but sorted on the columns you specify
#' in the arguments. For example, by `trt` and `celltype`:
#+ arrange01
# Sort by trt, then by celltype, and show the first 20 rows of the result
dat0 %>%
  arrange(trt, celltype) %>%
  slice_head(n = 20) %>%
  pander(split.tables = Inf, split.cells = Inf)
#' You can wrap any of the arguments in `desc()` to make it sort from largest to
#' smallest instead of the default smallest to largest.
#+
# Sort by trt, age, celltype with age in the default ascending order
dat0 %>%
  arrange(trt, age, celltype) %>%
  slice_head(n = 5) %>%
  pander(split.tables = Inf, split.cells = Inf)
# Same sort keys, but age wrapped in desc() so it sorts largest-first
dat0 %>%
  arrange(trt, desc(age), celltype) %>%
  slice_head(n = 5) %>%
  pander(split.tables = Inf, split.cells = Inf)
#' ## Column Operations
#'
#' ### `select` and `where`
#'
#' `select()` extracts the columns of your choosing. The simplest way to do it
#' is by giving the unquoted, quoted, or back-quoted (` `` `) names of the
#' columns you want, each as a separate un-named argument. If your column names
#' contain spaces or punctuation marks, you have to quote or back-quote them. Here we
#' select three columns, one quoted, one unquoted, one back-quoted.
#+ select01
# 'karno' is quoted, age is unquoted, and `prior` is back-quoted
dat0 %>%
  select('karno', age, `prior`) %>%
  slice_head(n = 5) %>%
  pander(split.tables = Inf, split.cells = Inf)
#' You can also select by the _type_ of column by using the `where()` helper
#' function. That function, in turn, should be wrapped around the _un_quoted
#' name of a function that takes a vector as its first argument and returns a
#' _single_ `TRUE` or `FALSE`. For example, `is.numeric()` and `is.character()`.
#' But you have to use them without the parentheses because you're referencing
#' the functions themselves, not their result (which is what the parentheses
#' would mean).
#+ select02
# Columns for which is.numeric() returns TRUE
dat0 %>%
  select(where(is.numeric)) %>%
  slice_head(n = 5) %>%
  pander(split.tables = Inf, split.cells = Inf)
# Columns for which is.character() returns TRUE
dat0 %>%
  select(where(is.character)) %>%
  slice_head(n = 5) %>%
  pander(split.tables = Inf, split.cells = Inf)
#' You can also select columns whose names match a particular pattern by using
#' `starts_with()`, `ends_with()`, or `matches()` helper functions.
#+ select03
# Columns whose names begin with 't'
dat0 %>%
  select(starts_with('t')) %>%
  slice_head(n = 5) %>%
  pander(split.tables = Inf, split.cells = Inf)
# Columns whose names end with 'time'
dat0 %>%
  select(ends_with('time')) %>%
  slice_head(n = 5) %>%
  pander(split.tables = Inf, split.cells = Inf)
# Columns whose names match the pattern 'ag'
dat0 %>%
  select(matches('ag')) %>%
  slice_head(n = 5) %>%
  pander(split.tables = Inf, split.cells = Inf)
#'
#' ### `mutate`
#'
#' ### `rename`
#'
#' ### `relocate`
#'
#' ## Groups of Rows
#'
#' ### `group_by` and `summarise`
#'
#' Convert every column that has fewer than three distinct values to a factor
dat1 <- dat0 %>%
  mutate(across(
    where(function(column) length(unique(column)) < 3),
    as.factor
  ))
#' Now try the scatterplot matrix again
# Scatterplot matrix of the converted data
ggpairs(data = dat1)