Many changes

cangermueller · May 17, 2015 · 9a39738 · 9a39738
1 parent b4caba7
commit 9a39738
Show file tree

Hide file tree

Showing 80 changed files with 3,729 additions and 694 deletions.
diff --git a/R/argparse.R b/R/argparse.R
@@ -2,11 +2,10 @@
 
 library(argparse)
 
-ap <- ArgumentParser(description='Concatenates several files')
-ap$add_argument('files', metavar='FILE', nargs='+', help='Input files to be concatenated')
-ap$add_argument('--output-file', dest='outfile', metavar='FILE', help='Output file')
-ap$add_argument('--verbose', action='store_true', help='Be verbose')
-ap$add_argument('--level', dtype='integer', default='100', help='Some number')
+p <- ArgumentParser(description='Description')
+p$add_argument('in_file', help='Input file')
+p$add_argument('-o', '--out_file', help='Output file')
+p$add_argument('--verbose', action='store_true', help='More detailed log messages')
 
-args <- ap$parse_args(commandArgs(TRUE))
+args <- p$parse_args(commandArgs(TRUE))
 print(args)
diff --git a/R/biomart.txt b/R/biomart.txt
@@ -0,0 +1,16 @@
+library(biomaRt)
+
+mart <- useMart('ensembl')
+mart <- useDataset('mmusculus_gene_ensembl', mart=mart)
+attr <- c('ensembl_gene_id', 'mgi_id', 'mgi_symbol')
+q <- getBM(attr=attr, mart=mart)
+
+getBM(attr=, mart=)
+  attr=c('ensembl_gene_id')  // retrieved fields
+  filter=c('ensembl_gene_id') // filter fields
+  value=c('ENSMUSG00000064336') // filter ensembl_gene_id == ENSMUSG00000064336
+
+listMarts()
+listDatasets()
+listAttributes()
+listFilters()
diff --git a/R/brewer.txt b/R/brewer.txt
@@ -16,3 +16,30 @@
 'RdYlGn'
 colorRampPalette(pal)(200)  // interpolate color palette to 200 colors
 
+
+
+brewer_cols <- function(pal='Spectral', rev=F) {
+  ncol <- brewer.pal.info[pal, 'maxcolors']
+  col <- colorRampPalette(brewer.pal(ncol, pal))(50)
+  if (rev) {
+    col <- rev(col)
+  }
+  return (col)
+}
+
+## Sequential
+cols <- list()
+cols[[length(cols)+1]] <- brewer_cols('YlGnBu')
+cols[[length(cols)+1]] <- brewer_cols('YlGn')
+cols[[length(cols)+1]] <- brewer_cols('Reds')
+cols[[length(cols)+1]] <- brewer_cols('RdPu')
+cols[[length(cols)+1]] <- brewer_cols('Purples')
+cols[[length(cols)+1]] <- brewer_cols('PuBuGn')
+cols[[length(cols)+1]] <- brewer_cols('PuBu')
+cols[[length(cols)+1]] <- brewer_cols('OrRd')
+cols[[length(cols)+1]] <- brewer_cols('Greens')
+cols[[length(cols)+1]] <- brewer_cols('GnBu')
+cols[[length(cols)+1]] <- brewer_cols('BuPu')
+cols[[length(cols)+1]] <- brewer_cols('BuGn')
+cols[[length(cols)+1]] <- brewer_cols('Blues')
+cols[[length(cols)+1]] <- brewer_cols('Spectral', rev=T)
diff --git a/R/clust.txt b/R/clust.txt
@@ -27,8 +27,11 @@ use=
   - method=single/complete/average
   - hc$order  // order or rows
 * plot(hc, ...)
-  xlab=NA, ylab=NA, main=NA, sub=NA
-
+  xlab=NA
+  ylab=NA
+  main=NA
+  sub=NA
+
 * members = cutree(hc, k=, h=)
   - return cluster assignment for hierachical clustering
   - k: number of clusters

diff --git a/R/cor.txt b/R/cor.txt
@@ -0,0 +1,22 @@
+cor(M, method=, use=, ...)
+  * between columns of M
+  method=pearson, spearman, kendall
+  use=
+    * everything: NA produce column with NA
+    * all.obs: NA throws error
+    * complete.obs: remove rows that contain >0 NA (all samples)
+    * pairwise.complete.obs: remove only rows for pairwise comparison (different rows for different pairs)
+
+# cor to dist
+dist = 0.5 * (1 - abs(r))
+
+# cor.test
+cor.test(x, y, ...)
+  alternative=two.sided (default), less, greater
+
+# weighted
+library(weights)
+weighted.mean(x)  // default stats package
+wtd.var(x, w=NULL)
+wtd.cor(x, y, w=NULL)
+
diff --git a/R/corrplot.txt b/R/corrplot.txt
@@ -1,16 +1,27 @@
+# corrplot
 corrplot(matrix, method=, ...)
-# methods
-color
-
-# parameters
-is.corr=FALSE
-order=c('origional|hclust') // hclust correct?
+method=color, num
+is.corr=FALSE // if matrix is not correlation matrix (e.g. distance)
+order='original'
+  hclust: hclust(as.dist(matrix)) // which is WRONG if matrix is correlation matrix
 addrect=5 // rectangles, only if order='hclust'
 tl.col='black'  // color labels
 col=color_palette // c(col, col) do obtain full range
 cl.lim=c(0, 1)
 
+# corrplot.mixed
+corrplot.mixed(matrix, lower='number', upper='circle')
+square, circle, ellipse, shade, color, number
+
 # colors
 col1 = colorRampPalette(c('white', 'blue'))
 col=col1(200)
 
+# If m is correlation matrix
+corrplot(m)
+plot(hclust(as.dist(1 - m)))
+
+# if m is distance matrix
+corrplot(m, is.corr=F)
+plot(hclust(as.dist(m)))
+
diff --git a/R/dplyr.txt b/R/dplyr.txt
@@ -1,42 +1,106 @@
+# misc
 plyr: only data.frames
 library(plyr) // load plyr first
 library(dplyr)
+options(...)
+  dplyr.width=200  // output width of tbl_df in characters
+  dplyr.print_min=10  // minimum # records to print; default=10
 
+# tbl_df
+* removes rownames -> use add_rownames('name')
 tbl = tbl_df(df)  // convert to tbl_df; nicer print
-print(tbl, n=10)
+tbl %>% print(n=10)  // print first 10 rows
 
-filter(df, c1=v1, c2=v2|v3, c3 %in% c(v1, v2))  // select rows
+# data_frame
+* creates tbl_df
+* no rownames
+* special column names
+data_frame(a=1:10, b=a*2, 'a+b'=a+b) %>% glimpse
+
+# printing / inspecting
+print(...)
+  n=10  // 10 lines
+  width=400 // 400 characters wide
+  width=Inf // all columns
+glimpse // similar to str
+
+
+# select
 select(df, c1, c2)  // select columns
-  c1:c10  // between
-  -(c1:c10) // except
-  starts_with('prefix')
-  ends_with('suffix')
-  contains('substr')
-  matches('pattern')
-arrange(df, c1, desc(c2)) // sort by multiple columns
+c1:c10  // between
+-(c1:c10) // except
+c1=mpg, c2=drat // select and rename
+starts_with('prefix')
+ends_with('suffix')
+contains('substr')
+matches('pattern')
+one_of(c('c1', 'c2')) // select by vector
+
+# filter
+filter(c1==1, c2>2)
+between(c1, min, max)
+!is.na(c1)
+
+# mutate, transmute
 mutate(df, c = f(c1, c2)) // add or mutate columns
+transmute(c=f(c1))  // like mutate, but only keeps new columns
+
+# misc
+filter(df, c1==v1, c2==v2 | c2==v3, c3 %in% c(v1, v2))  // select rows
+arrange(df, c1, desc(c2)) // sort by multiple columns
+  * df must be data.frame, not tbl_df!!
 rename(df, new=old) // rename columns; unquoted
+slice(start:end, c(i, j, k))  // select rows by index
+add_rownames('c_rownames')  // rownames -> column 'c_rownames'
 
 # joins
-left_join(df, by=)
-right_join()
-inner_join()
-outer_join()
+left_join(df, ...)  // in a; fill NA
+  by=c('c1', 'c2')
+  by=c('left_c1'='right_c1')  // different column names
+right_join()  // in b; fill NA
+inner_join()  // in a AND b
+## filtering
+* filter records in a; do not add additional columns
+semi_join() // only show records from a that match b
+anti_join() // only show records from a that do NOT match b
 
 
 # group_by / summarise
 group_by(columns) %>%
-  summarise(c=f(c), ...)  // aggregate columns
-group_by(columns) %>%
-  summarise_each(funs(f1, f2), columns) // apply same function(s) on column(s)
-group_by(columns) %>%
-  summarise(n=n(), n_val=n_distinct(c))  // counts # records
-group_by(columns) %>% // count # records
-  tally()
-group_by(columns) %>%
-  summarise_each(funs(mean(., na.rm=T)), matches('pattern'))
-group_by(columns) %>%
-  top_n(2, wt = column) // select top n records from group by column
+summarise(c=f(c), ...)  // aggregate columns
+summarise_each(funs(f1, f2), columns) // apply same function(s) on column(s)
+summarise_each(funs(mean(., na.rm=T)), matches('pattern'))
+summarise(n=n(), n_val=n_distinct(c))  // counts # records
+
+## functions
+group_by(id) %>% n_groups()  // # groups
+group_by(id) %>% summarise(n=n()) // # records in group
+group_by(.) %>% mutate(c=f(c))  // group-specific modifications
+count(id) // # records in group
+tally(var, ...) // == summarise(n=sum(var))
+  sort=T  // sort by n
+  wt=col  // weight records by col; summarize(n=sum(col))
+count(...)  // group_by(col) %>% tally
+slice(start:end, c(i, j))  // select by index
+slice(1)  // select single group member
+  head(3) // first 3 records (does not always work!)
+top_n(3, id) // select top n ordered by id in DESCENDING order
+  * returns MULTIPLE records, if id not unique!
+sample_n(3) // sample three records of each group
+distinct  // unique members; more efficient than unique
+
+## sorting
+d %>% group_by(k) %>% arrange(...)  // sort within groups
+d %>% group_by(k) %>% summarize %>% ungroup %>% arrange(...)  // sort globally
+d %>% group_by(k1, k2) %>% summarize(cs=f(c)) %>% arrange(cs)
+  * summarize removes one group level (k2)
+  * sort by cs for each group in k1
+
+## do
+group_by(...) %>% do(f(.))
+  f is function that returns data.frame
+  . is current group as data.frame
+data %>% group_by(id) %>% do(data.frame(c1=sd(.$c1), c2=sd(.$c2)))
 
 
 # piping
@@ -53,6 +117,7 @@ lag(v, n = 1), lead(v, n = 1)
   * v - lag(v)  // change to previous value
 sample_n(v, n, replace = F, weights = NULL) // sample n rows
 sample_frac(v, frac, ...) // sample frac % of rows
+ntile(v, n) // assign each v to one of n bins; same #points in each bin
 
 
 # Database support
@@ -62,3 +127,14 @@ sample_frac(v, frac, ...) // sample frac % of rows
 my_db = src_sqlite('my_db.sqlite3')
 table = tbl(my_db, 'table')
 
+# Standard evaluation function
+function()  // non-standard evaluation
+function_() // standard evaluation
+
+# Strings/character in function
+fun_()
+filter_('c1 > 10')
+rename_('new'='old')
+arrange_('c1', 'desc(c2)')
+
+
diff --git a/R/general.txt b/R/general.txt
@@ -2,6 +2,10 @@
 options('name') // get
 options(name=value) // set
 
+# Exception handling
+results <- tryCatch(f(), error = function(x) return (y))
+  * return f(), or y if f() fails
+
 
 # Representing missing values
 * NA
@@ -39,9 +43,9 @@ Sys.setenv(env)
 * installed.packages()
 * available.packages()
 * remove.packages('pkg')  // uninstall package
-* update.packages() // update all packages in library
+* update.packages(ask=F, checkBuilt=T) // update all packages in library
   * ask=F // do not ask
-  * checkBuild=T // built with R 2.15; but now R 3.0 -> check
+  * checkBuilt=T // built with R 2.15; but now R 3.0 -> check
 * chooseCRANmirror()
 * R CMD INSTALL <pkg directory> // install source package manually
 * library, require(name): load packages