Chapter 3 Dplyr
3.1 Select columns
# Select columns
select(A,B) # select columns by name
select(c(A,B)) # select multiple columns by name
select(A,B:D) # select multiple oclumns by names
select(-B,-B) # exclude columns by name
select(-c(A,B)) # exclude multiple columns by name
select(starts_with('a')) # select by names of fields starting with ...
select(-starts_with('a')) # exclude by names of fields starting with ...
select(ends_with('a')) # ends with a prefix
select(contains('a')) # select by names of fields contains a literal string
select(matches()) # matches a regular expression
select(num_range()) # numerical range like x01, x02, x03.
select(one_of()) # variables in character vector
select(everything()) # all variables
select(A, B:D, contains('foo'), contains('bar'))
# Reorder columns
select(A, everything()) # reorder variables, that A will be in the 1st column
select(C, B, A, everything()) # reordrer columns
select(C, B, A) # the same as previous
# Rename columns
rename(A=X) # rename column A as X
3.2 Filter rows by condition
# Filter rows
filter(A==1) # rows where column A is equal to 1
filter(A>1 & B<2) # multiple condition
filter(A %in% c(1,2,3)) # select rows if A is from given vector
filter(A %in% c(1,2) & C > 3) # multiple filtering AND
filter(A %in% c(1, 2) | C == 3) # multiple filtering OR
filter(!A %in% c(2, 3)) # filtering NOT
filter(grepl("a", A)) # filter rorws by grep values in column A
filter(is.na(A)) # filter rows if column A contains 'NA'
filter(!is.na(A)) # filter 'NA'
3.3 Slice rows by index
3.4 Adding rows and columns
3.5 Remove duplicates
3.6 Grouping and aggregation
3.7 Operations on several data frames
# Compair data
all_equal(x,y) # compair two data frames
# Combine data
intersect(x,y) # rows that appear in both x and y.
union(x,y) # rows that appear in either or both x and y.
setdiff(x,y) # rows that appear in x but not y.
# Sort
arrange(A) # sort rows by A column
arrange(desc(A)) # sort by descendence of values in column A
arrange(A, B) # sort by A & B
group_by(A) # group rows by A column
group_by(A = as.factor(A)) # group rows by A column
group_by(A = cut(A, 3)) # group by 3 ranges from A
3.8 Join functions
inner_join(x, y, by = )
left_join(x, y, by = )
right_join(x, y, by = )
full_join(x, y, by = )
semi_join(x, y, by = )
anti_join(x, y, by = )
library(dplyr)
a <- data.frame(x1=c('A','B','C'), x2=c(1,2,3))
b <- data.frame(x1=c('A','B','D'), x3=c('T','F','T'))
a
## x1 x2
## 1 A 1
## 2 B 2
## 3 C 3
## x1 x3
## 1 A T
## 2 B F
## 3 D T
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
## x1 x2 x3
## 1 A 1 T
## 2 B 2 F
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
## x1 x2 x3
## 1 A 1 T
## 2 B 2 F
## 3 C 3 <NA>
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
## x1 x2 x3
## 1 A 1 T
## 2 B 2 F
## 3 D NA T
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
## x1 x2 x3
## 1 A 1 T
## 2 B 2 F
## 3 C 3 <NA>
## 4 D NA T
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
## x1 x2
## 1 A 1
## 2 B 2
## Warning: Column `x1` joining factors with different levels, coercing to
## character vector
## x1 x2
## 1 C 3
3.9 Miscellaneous
df %>% na_if("") # Convert empty spaces to 'NA'
sample_frac(df, size=0.8) # Randomly select fraction of rows
sample_n(10) # Randomly select number of rows
# toy data
df <- data.frame("Age" = c(10,15,10,15), "Name" = c("A","B", "C", "B"), "Gender"=c(1,0,1,0))
df
## Age Name Gender
## 1 10 A 1
## 2 15 B 0
## 3 10 C 1
## 4 15 B 0
3.10 Adding rows and columns
## Age Name Gender N
## 1 10 A 1 NA
## 2 15 B 0 NA
## 3 10 C 1 NA
## 4 15 B 0 NA
## Age Name Gender
## 1 10 A 1
## 2 15 B 0
## 3 10 C 1
## 4 15 B 0
## 5 NA <NA> NA
3.11 Remove duplicates
## Age Name Gender
## 1 10 A 1
## 2 15 B 0
## 3 10 C 1
## Age Name Gender
## 1 10 A 1
## 2 15 B 0
## Age Name Gender
## 1 10 A 1
## 2 15 B 0
3.12 Select
## Name Gender
## 1 A 1
## 2 B 0
## 3 C 1
## 4 B 0
## Age
## 1 10
## 2 15
## 3 10
## 4 15
## Age
## 1 10
## 2 15
## 3 10
## 4 15
## Age
## 1 10
## 2 15
## 3 10
## 4 15
## Name Gender
## 1 A 1
## 2 B 0
## 3 C 1
## 4 B 0
## Age Gender
## 1 10 1
## 2 15 0
## 3 10 1
## 4 15 0
## Gender
## 1 1
## 2 0
## 3 1
## 4 0
# matches() Matches a regular expression
# num_range() Numerical range like x01, x02, x03.
# one_of() Variables in character vector.
# everything() All variables.
select(df, Age, everything()) # reorder variables, that Age will be in the 1st column
## Age Name Gender
## 1 10 A 1
## 2 15 B 0
## 3 10 C 1
## 4 15 B 0
## Gender Age Name
## 1 1 10 A
## 2 0 15 B
## 3 1 10 C
## 4 0 15 B
## Gender Age Name
## 1 1 10 A
## 2 0 15 B
## 3 1 10 C
## 4 0 15 B
3.13 Filter rows
## Age Name Gender
## 1 10 A 1
## 2 10 C 1
## Age Name Gender
## 1 15 B 0
## 2 15 B 0
## Age Name Gender
## 1 10 A 1
## 2 15 B 0
## 3 15 B 0
## Age Name Gender
## 1 15 B 0
## 2 15 B 0
## Age Name Gender
## 1 10 A 1
## 2 15 B 0
## 3 10 C 1
## 4 15 B 0
## Age Name Gender
## 1 10 C 1
## Age Name Gender
## 1 10 A 1
3.14 Summarize
summarize(df, avg = mean(Age), m = median(Age))
summarise_at(df, vars(Gender, Age), funs(n(), mean, median)) # multiple functions summarise_if(df, is.numeric, funs(n(),mean,median)) # for all numeric columns summarise_at(df, vars(Gender,Age), function(x) var(x - mean(x))) # custom function
summarise summarize_all # Allply funs to every column summarize_at # Apply funs to specific columns summarize_if # Apply funs to all cols of one type
3.15 Sort
## Age Name Gender
## 1 10 A 1
## 2 10 C 1
## 3 15 B 0
## 4 15 B 0
## Age Name Gender
## 1 15 B 0
## 2 15 B 0
## 3 10 A 1
## 4 10 C 1
## Age Name Gender
## 1 10 A 1
## 2 10 C 1
## 3 15 B 0
## 4 15 B 0
## # A tibble: 4 x 3
## # Groups: Age, Name [3]
## Age Name Gender
## <dbl> <fct> <dbl>
## 1 10 A 1
## 2 15 B 0
## 3 10 C 1
## 4 15 B 0
3.16 Pipes
## Age Name
## 1 10 C
## 2 15 B
3.17 Combine data
intersect(x, y) # Rows that appear in both x and y. union(x, y) # Rows that appear in either or both x and y. setdiff(x, y) # Rows that appear in x but not y.
3.18 ‘by’ is a common variable (primary key) to join by.
inner_join(x, y, by = ) left_join(x, y, by = ) right_join(x, y, by = ) full_join(x, y, by = ) semi_join(x, y, by = ) anti_join(x, y, by = )
if_else(condition, true, false, missing = NULL) mydf =data.frame(x = c(1:5,NA))
3.19 Nested If_Else
mydf %>% mutate(newvar= if_else(is.na(x),“I am missing”, if_else(x==1,“I am one”, if_else(x==2,“I am two”, if_else(x==3,“I am three”,“Others”))))) #TODO bind_rows() bind_cols() ntile()
3.20 if() Family of Functions
3.21 TODO
select_if mutate_if pull()
3.22 Vectorize functions to columns
mutate transmute mutate_all mutate_at add_column rename
3.23 How to …
3.23.1 Convert empty spaces to NA
## [1] "a" "b" NA "d"
3.23.2 Randomly select n rows
df <- data.frame(A=seq(1:10), B=seq(.1,1,.1))
df %>% sample_frac(size=0.3) # Randomly select fraction of rows
## A B
## 1 10 1.0
## 2 8 0.8
## 3 4 0.4
sample_n(df, size, …) # Randomly select size rows slice(df # select rows by position