R notes

df[-(1:5), ] # Omit first 5 rows of df
df[, -4]     # Omit fourth column of df
lapply(X, FUN, ...)
sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
replicate(n, expr, simplify = "array")
simplify2array(x, higher = TRUE)
mean(c(1:9, NA), trim=0.1, na.rm = TRUE)

Environments

If a variable name is not defined inside a function, R will look one level up.
If a name also isn’t defined there, an error occurs:
x=1
rm(x) #remove a definition of x
# `x` is a free variable: it is not defined inside g(), so R looks it up in
# the enclosing environment at call time and errors if it cannot be found.
g <- function() {
  c(x, 1)
}
A function should never depend on variables other than its arguments, because such variables could be changed from outside the function.
Each call to a function has its own clear environment:
# Demonstrates that every call gets a fresh local environment: the `a`
# assigned during one call is gone by the next call.
# Prints `a`: 1 on every call when no `a` is visible in an enclosing
# environment, otherwise that outer `a` plus 1 (exists() also searches the
# enclosing environments).
j <- function() {
  if (!exists("a")) {
    a <- 1
  } else {
    a <- a + 1
  }
  print(a)
}
Two types of vectors in R:
  • Atomic vectors (homogeneous) of 6 types: logical; integer; double;character; complex; raw.
  • Lists (heterogeneous recursive vectors) – can contain other lists
Every vector has 2 key properties: typeof(), length(). Additional meta data can be added to a vector through attributes. Creating and subsetting a list:
x1 <- list(a, b, c(TRUE, FALSE), 1:10)
x2 <- list(x1, 
           a = list(-1, -3),
           b = 1:3,
           c = "a string",
           d = pi)

x[i] - extracts a sublist
x[[i]], x$i - extract elements (remove a level of hierarchy)

>str(x2[2])   
List of 1
 $ a:List of 2
  ..$ : num -1
  ..$ : num -3

>str(x2[[2]])
List of 2
 $ : num -1
 $ : num -3
Missing values
typeof(NULL) is NULL #indicates the absence of a vector
length(NULL) is 0
typeof(NA) is logical #indicates the absence of a value in a vector
length(NA) is 1
Any basic math or logical comparison with NA always results in NA:
# Each expression below evaluates to NA: a missing operand makes the
# result of arithmetic and comparisons missing as well.
NA + 10  # NA
NA > 5   # NA
10 == NA # NA
NA == NA # NA
Use seq_along() instead of 1:ncol(df) as a loop iterator to provide handling for empty cases:
for (i in seq_along(empty_df)) {
  print(median(empty_df[[i]]))
}
x <- c(1:10, NA) 
# Linearly rescale `x` so that its smallest non-missing value maps to 0 and
# its largest to 1. Missing values are ignored when finding the range and
# stay missing in the result.
rescale01 <- function(x) {
  bounds <- range(x, na.rm = TRUE)
  lo <- bounds[1]
  hi <- bounds[2]
  (x - lo) / (hi - lo)
}

# Number of positions at which both `x` and `y` are missing.
both_na <- function(x, y) {
  missing_in_both <- is.na(x) & is.na(y)
  sum(missing_in_both)
}
Function calculates a confidence interval for a population mean:
# Confidence interval for a population mean, based on normal quantiles.
#   x     : numeric vector of observations
#   level : confidence level of the interval (default 0.95)
# Returns c(lower, upper). An empty `x` triggers a warning and returns
# c(-Inf, Inf).
mean_ci <- function(x, level = 0.95) {
  if (length(x) == 0) {
    warning("`x` was empty", call. = FALSE)
    return(c(-Inf, Inf))
  }
  se <- sd(x) / sqrt(length(x))
  alpha <- 1 - level
  mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}
>cat(LETTERS[1:4]) #the function designed just to display output
A B C D
> paste(LETTERS[1:4])
[1] "A" "B" "C" "D"
#replace NAs:
x[is.na(x)] <- replacement

#sampling
sample(c(1:5, NA), 100, replace = TRUE)
#rnorm()
#qnorm()

#Allocation
output <- numeric(ncol(df))

{purrr}

map functions:
  1. Loop over a vector .x
  2. Apply the function .f to each element
  3. Return the results, in the same order as the input:
     map() returns a list or data frame
     map_lgl() returns a logical vector
     map_int() returns an integer vector
     map_dbl() returns a double vector
     map_chr() returns a character vector
map_dbl(df, mean, trim = 0.5, na.rm = TRUE)
sapply(df, mean)
##########################################
# Apply `fun` to every column of `df` and collect the results in a numeric
# vector with one entry per column.
#   df  : data frame whose columns are summarised
#   fun : function called on each column; expected to return a single number
col_summary <- function(df, fun) {
  # Preallocate the result instead of growing it inside the loop
  results <- numeric(ncol(df))
  for (col_idx in seq_along(df)) {
    column <- df[[col_idx]]
    results[[col_idx]] <- fun(column)
  }
  results
}
col_summary(df, mean)
##########################################
# Attach purrr, which provides the map_*() functions used below
library(purrr)
map_dbl(df, mean)
# Find the columns that are numeric
map_lgl(df, is.numeric)

# Find the type of each column
map_chr(df, typeof)

# Find a summary of each column
map(df, summary)

# Find the 5th percentile of each column, excluding missing values
# (probs and na.rm are passed through map_dbl() to quantile())
map_dbl(df, quantile, probs = c(0.05), na.rm = TRUE)
Using an anonymous function
map(cyl, function(df) lm(mpg~wt, data=df))
Using a formula
map(cyl, ~ lm(mpg ~ wt, data = .))
Using a string
list_of_results <- list(
  list(a = 1, b = "A"), 
  list(a = 2, b = "C"), 
  list(a = 3, b = "D")
)
#pull out the "a" element from every entry with string shortcut
map(list_of_results, "a")
# Save the result of linear model
models<- map(cyl, ~ lm(mpg ~ wt, data = .))
# Use map and coef to get the coefficients for each model: coefs
coefs<- map(models, coef)
# Use string shortcut to extract the wt coefficient 
map(coefs, "wt")
Using a numeric vector
coefs <- map(models, coef)
# use map_dbl with the numeric shortcut to pull out the second element
map_dbl(coefs,2)
Pipe operator %>%: x %>% f(y) is another way of writing f(x, y)
#Downloading the HTML files at each URL
urls <- list(
  example = "http://example.org",
  rproj = "http://www.r-project.org",
  asdf = "http://asdfasdasdkfjlda"
)
#generates an error:
map(urls, readLines)
#generates a list with returned value and an error 
# (the result is stored in `html` so the extractions below have something to read)
html <- map(urls, safely(readLines))
# Extract the result from one of the successful elements
html$example$result
# Extract the error from the element that was unsuccessful
html$asdf$error

#transpose() turns a list-of-lists "inside-out":
# Extract the results:
transpose(html)$result
# Extract the errors:
transpose(html)$error

# Initialize some objects
safe_readLines <- safely(readLines)
html <- map(urls, safe_readLines)
res <- transpose(html)[["result"]]
errs <- transpose(html)[["error"]]
# Create a logical vector is_ok
is_ok <- (map_lgl(errs, is_null))
# Extract the successful results
res[is_ok]
# Find the URLs that were unsuccessful
urls[!is_ok]

#3 calls of rnorm()
rnorm(5)
rnorm(10)
rnorm(20)
#one call of map()
map(list(5,10,20), rnorm)
#rnorm(n, mean=0, sd=1)

rnorm(5, mean=1)
rnorm(10, mean=5)
rnorm(20, mean=10)
map2() to iterate over 2 arguments
map2(list(5,10,20), list(1,5,10), rnorm)
pmap() to iterate over many arguments
#pmap(.l, .f, ...)
rnorm(5, mean=1, sd=0.1)
rnorm(10, mean=5, sd=0.5)
rnorm(20, mean=10, sd=0.1)
pmap(list(n =    list(5, 10, 20),
          mean = list(1,5,10),
          sd =   list(0.1, 0.5, 0.1)), rnorm)
invoke_map() to iterate over functions and arguments
#invoke_map(.f, .x = list(NULL), ...)
rnorm(5)
runif(5)
rexp(5)
invoke_map(list(rnorm, runif, rexp), n=5)
— walk() operates just like map() except it’s designed for functions that don’t return anything. Use walk() for functions with side effects like printing, plotting or saving.
— walk() functions return the object you passed to them (so they can easily be used in pipelines).
library(ggplot2)
#one plot per cylinder group: map() needs a list of data frames, not the raw cyl column
plots <- split(mtcars, mtcars$cyl) %>% map(~ggplot(., aes(mpg, wt)) + geom_point())
paths <- paste0(names(plots), ".pdf")
#take a description of a plot and save it to disk
walk2(paths, plots, ggsave)
#take a peek at the contents of x and find the lengths, in one line
lengths <- x %>% walk(print) %>% map_dbl(length)
rexp()
runif()
Robust Functions
Three main problems:
  • Type-unstable functions
  • Non-standard evaluation
  • Hidden arguments
  • ——————
  • Type inconsistent: the type of the return object depends on the input.
    E.g. df[,1] – sometimes returns a data frame, sometimes – a vector.
    Two common solutions for [ ]
    #1
    #Setting drop = FALSE forces single bracket subsetting to be type-consistent
    # Return the last row of `df`, always as a data frame.
    last_row <- function(df) {
      final_idx <- nrow(df)
      # Without drop = FALSE, df[final_idx, ] on a one-column data frame
      # collapses to a bare vector instead of a data frame.
      df[final_idx, , drop = FALSE]
    }
    df <- data.frame(x=1:3)
    last_row(df)
    
    #2
    #Subsetting the data frame like a list
    df[x]
    
    Note: all functions in {purrr} are type-consistent
    E.g. map() is a type consistent function - always returns a list.
    # Return the first class of every column of `df` as a character vector.
    col_classes <- function(df) {
      # map() yields a list holding the class vector of each column; the
      # numeric shortcut 1 in map_chr() picks the first element of each.
      map_chr(map(df, class), 1)
    }
    
    flatten_chr() takes a list and removes its hierarchy and will either return a character string or an error message.
    -------------------------------
  • Non-standard evaluation - functions which don't use the usual look-up rules for variables - may cause problems when used inside your own functions
    For possible solutions read Hadley Wickham's vignette.
  • #evaluated inside mtcars
    #It does not exist in global environment
    subset(mtcars, disp > 450)

    Pure functions
    • Output only depends on input
    • Don't affect the outside environment except through their return value
  • Hidden arguments are function inputs that may be different for different users or sessions. E.g. argument defaults that depend on global options.
  • Global options are settings that affect the entire R session
    Getting and setting options:
    #get a list with all values of global options
    options()
    #An option that controls how many digits to print for numeric values 
    getOption("digits")
    options(digits = 5)
    
    getOption("stringsAsFactors")