Chapter 3 Dplyr

3.8 Join functions

##   x1 x2
## 1  A  1
## 2  B  2
## 3  C  3
##   x1 x3
## 1  A  T
## 2  B  F
## 3  D  T
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
##   x1 x2 x3
## 1  A  1  T
## 2  B  2  F
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
##   x1 x2   x3
## 1  A  1    T
## 2  B  2    F
## 3  C  3 <NA>
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
##   x1 x2 x3
## 1  A  1  T
## 2  B  2  F
## 3  D NA  T
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
##   x1 x2   x3
## 1  A  1    T
## 2  B  2    F
## 3  C  3 <NA>
## 4  D NA    T
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
##   x1 x2
## 1  A  1
## 2  B  2
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
##   x1 x2
## 1  C  3

3.10 Adding rows and columns

##   Age Name Gender  N
## 1  10    A      1 NA
## 2  15    B      0 NA
## 3  10    C      1 NA
## 4  15    B      0 NA
##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1
## 4  15    B      0
## 5  NA <NA>     NA

3.11 Remove duplicates

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1
##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
##   Age Name Gender
## 1  10    A      1
## 2  15    B      0

3.12 Select

##   Name Gender
## 1    A      1
## 2    B      0
## 3    C      1
## 4    B      0
##   Age
## 1  10
## 2  15
## 3  10
## 4  15
##   Age
## 1  10
## 2  15
## 3  10
## 4  15
##   Age
## 1  10
## 2  15
## 3  10
## 4  15
##   Name Gender
## 1    A      1
## 2    B      0
## 3    C      1
## 4    B      0
##   Age Gender
## 1  10      1
## 2  15      0
## 3  10      1
## 4  15      0
##   Gender
## 1      1
## 2      0
## 3      1
## 4      0
##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1
## 4  15    B      0
##   Gender Age Name
## 1      1  10    A
## 2      0  15    B
## 3      1  10    C
## 4      0  15    B
##   Gender Age Name
## 1      1  10    A
## 2      0  15    B
## 3      1  10    C
## 4      0  15    B

3.13 Filter rows

##   Age Name Gender
## 1  10    A      1
## 2  10    C      1
##   Age Name Gender
## 1  15    B      0
## 2  15    B      0
##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  15    B      0
##   Age Name Gender
## 1  15    B      0
## 2  15    B      0
##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1
## 4  15    B      0
##   Age Name Gender
## 1  10    C      1
##   Age Name Gender
## 1  10    A      1

3.14 Summarize

summarize(df, avg = mean(Age), m = median(Age))

summarise_at(df, vars(Gender, Age), funs(n(), mean, median)) # multiple functions
summarise_if(df, is.numeric, funs(n(), mean, median)) # for all numeric columns
summarise_at(df, vars(Gender, Age), function(x) var(x - mean(x))) # custom function

summarise
summarize_all # Apply funs to every column
summarize_at # Apply funs to specific columns
summarize_if # Apply funs to all cols of one type

3.15 Sort

##   Age Name Gender
## 1  10    A      1
## 2  10    C      1
## 3  15    B      0
## 4  15    B      0
##   Age Name Gender
## 1  15    B      0
## 2  15    B      0
## 3  10    A      1
## 4  10    C      1
##   Age Name Gender
## 1  10    A      1
## 2  10    C      1
## 3  15    B      0
## 4  15    B      0
## # A tibble: 4 x 3
## # Groups:   Age, Name [3]
##     Age Name  Gender
##   <dbl> <fct>  <dbl>
## 1    10 A          1
## 2    15 B          0
## 3    10 C          1
## 4    15 B          0

3.17 Combine data

intersect(x, y) # Rows that appear in both x and y.
union(x, y) # Rows that appear in either or both x and y.
setdiff(x, y) # Rows that appear in x but not y.

3.18 ‘by’ is a common variable (primary key) to join by.

inner_join(x, y, by = )
left_join(x, y, by = )
right_join(x, y, by = )
full_join(x, y, by = )
semi_join(x, y, by = )
anti_join(x, y, by = )

if_else(condition, true, false, missing = NULL)
mydf = data.frame(x = c(1:5, NA))

3.19 Nested If_Else

mydf %>% mutate(newvar = if_else(is.na(x), "I am missing",
                         if_else(x == 1, "I am one",
                         if_else(x == 2, "I am two",
                         if_else(x == 3, "I am three", "Others")))))
# TODO: bind_rows() bind_cols() ntile()

3.20 if() Family of Functions

3.21 TODO

select_if mutate_if pull()

3.22 Vectorize functions to columns

mutate transmute mutate_all mutate_at add_column rename

3.23 How to …

3.23.1 Convert empty spaces to NA

## [1] "a" "b" NA  "d"

3.23.2 Randomly select n rows

##    A   B
## 1 10 1.0
## 2  8 0.8
## 3  4 0.4

sample_n(df, size, …) # Randomly select size rows
slice(df, …) # Select rows by position

3.24 Sources