Chapter 3 Dplyr

3.1 Select columns

# Select columns
select(A,B)                      # select columns by name
select(c(A,B))                   # select multiple columns by name
select(A,B:D)                    # select column A and the range of columns B through D

select(-A,-B)                    # exclude columns by name
select(-c(A,B))                  # exclude multiple columns by name

select(starts_with('a'))         # select by names of fields starting with ...
select(-starts_with('a'))        # exclude by names of fields starting with ...
select(ends_with('a'))           # select by names of fields ending with a suffix
select(contains('a'))            # select by names of fields containing a literal string
select(matches())                # matches a regular expression
select(num_range())              # numerical range like x01, x02, x03.
select(one_of())                 # variables in character vector
select(everything())             # all variables

select(A, B:D, contains('foo'), contains('bar'))

# Reorder columns
select(A, everything())          # reorder variables so that A is in the 1st column
select(C, B, A, everything())    # reorder columns: C, B, A first, then all the others
select(C, B, A)                  # same order, but keeps only C, B, A (other columns are dropped)

# Rename columns
rename(A=X)                      # rename column X to A (new_name = old_name)

3.2 Filter rows by condition

# Filter rows
filter(A==1)                     # rows where column A is equal to 1
filter(A>1 & B<2)                # multiple condition

filter(A %in% c(1,2,3))          # select rows if A is from given vector
filter(A %in% c(1,2) & C > 3)    # multiple filtering AND
filter(A %in% c(1, 2) | C == 3)  # multiple filtering OR
filter(!A %in% c(2, 3))          # filtering NOT
filter(grepl("a", A))            # keep rows where column A matches the pattern "a"

filter(is.na(A))                 # keep rows where column A is 'NA'
filter(!is.na(A))                # drop rows where column A is 'NA'

3.3 Slice rows by index

slice(1L)            # first row
slice(2:5)           # 2-5 rows
slice(n())           # last row

# Indices must be either all positive or all negative
slice(5:n())         # from 5th to the last rows
slice(-5:-n())       # 1-4 rows
slice(1:4)           # 1-4 rows

3.4 Adding rows and columns

mutate(mycol = NA)               # add column 'mycol' filled with 'NA'
mutate(mycol = A*B)              # add column 'mycol' as A*B
mutate(mycol1, mycol2)           # add multiple columns
cbind(mycol = NA)                # add column 'mycol'

rbind(myrow = NA)                # add row filled with 'NA'

3.5 Remove duplicates

distinct()                       # remove duplicated rows
distinct(A, .keep_all=TRUE)      # remove rows when field 'A' is duplicated
distinct(A, B, .keep_all=TRUE)   # remove rows when several columns at the same row are duplicated

3.6 Grouping and aggregation

df %>%
  group_by(A) %>%
  summarize(avg_b = mean(B, na.rm = TRUE))

3.7 Operations on several data frames

# Compare data
all_equal(x,y)                   # compare two data frames

# Combine data
intersect(x,y)                   # rows that appear in both x and y.
union(x,y)                       # rows that appear in either or both x and y.
setdiff(x,y)                     # rows that appear in x but not y.

# Sort
arrange(A)                       # sort rows by A column
arrange(desc(A))                 # sort by column A in descending order
arrange(A, B)                    # sort by A & B

group_by(A)                      # group rows by A column
group_by(A = as.factor(A))       # group rows by A column
group_by(A = cut(A, 3))          # group by 3 ranges from A

3.8 Join functions

inner_join(x, y, by = )
left_join(x, y, by = )
right_join(x, y, by = )
full_join(x, y, by = )
semi_join(x, y, by = )
anti_join(x, y, by = )

library(dplyr)
a <- data.frame(x1=c('A','B','C'), x2=c(1,2,3))
b <- data.frame(x1=c('A','B','D'), x3=c('T','F','T'))
a

##   x1 x2
## 1  A  1
## 2  B  2
## 3  C  3

##   x1 x3
## 1  A  T
## 2  B  F
## 3  D  T

# Retain only rows in both sets.
dplyr::inner_join(a, b, by='x1')

## Warning: Column `x1` joining factors with different levels, coercing to
## character vector

##   x1 x2 x3
## 1  A  1  T
## 2  B  2  F

# Join matching rows from b to a
dplyr::left_join(a, b, by='x1')

## Warning: Column `x1` joining factors with different levels, coercing to
## character vector

##   x1 x2   x3
## 1  A  1    T
## 2  B  2    F
## 3  C  3 <NA>

# Join matching rows from a to b.
dplyr::right_join(a, b, by='x1')

## Warning: Column `x1` joining factors with different levels, coercing to
## character vector

##   x1 x2 x3
## 1  A  1  T
## 2  B  2  F
## 3  D NA  T

# Retain all values, all rows
dplyr::full_join(a, b, by = "x1")

## Warning: Column `x1` joining factors with different levels, coercing to
## character vector

##   x1 x2   x3
## 1  A  1    T
## 2  B  2    F
## 3  C  3 <NA>
## 4  D NA    T

# All rows in a that have a match in b.
dplyr::semi_join(a, b, by = "x1")

## Warning: Column `x1` joining factors with different levels, coercing to
## character vector

##   x1 x2
## 1  A  1
## 2  B  2

# All rows in a that do not have a match in b.
dplyr::anti_join(a, b, by = "x1")

## Warning: Column `x1` joining factors with different levels, coercing to
## character vector

##   x1 x2
## 1  C  3

3.9 Miscellaneous

df %>% na_if("")            # Convert empty strings to 'NA'

sample_frac(df, size=0.8)   # Randomly select fraction of rows
sample_n(10)                # Randomly select number of rows

# toy data
df <- data.frame("Age" = c(10,15,10,15), "Name" = c("A","B", "C", "B"), "Gender"=c(1,0,1,0))
df

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1
## 4  15    B      0

3.10 Adding rows and columns

df %>% mutate(N = NA)    # add new last column 'N' filled with 'NA'

##   Age Name Gender  N
## 1  10    A      1 NA
## 2  15    B      0 NA
## 3  10    C      1 NA
## 4  15    B      0 NA

df %>% rbind(N = NA)     # add new last row 'N' filled 'NA'

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1
## 4  15    B      0
## 5  NA <NA>     NA

3.11 Remove duplicates

df %>% distinct()                                # remove duplicated rows

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1

df %>% distinct(Age, .keep_all=TRUE)             # remove rows when field 'Age' is duplicated

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0

df %>% distinct(Age, Gender, .keep_all=TRUE)     # remove rows when fields 'Age' & 'Gender' are duplicated

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0

3.12 Select

df %>% select(Name, Gender)          # select fields

##   Name Gender
## 1    A      1
## 2    B      0
## 3    C      1
## 4    B      0

df %>% select(-Name, -Gender)        # exclude fields

##   Age
## 1  10
## 2  15
## 3  10
## 4  15

df %>% select(-c(Name, Gender))      # same as previous

##   Age
## 1  10
## 2  15
## 3  10
## 4  15

df %>% select(starts_with("A"))      # select names of fields: A -> Age

##   Age
## 1  10
## 2  15
## 3  10
## 4  15

select(df, -starts_with("A"))        # select all except A -> Age

##   Name Gender
## 1    A      1
## 2    B      0
## 3    C      1
## 4    B      0

select(df, contains("G"))            # Contains a literal string

##   Age Gender
## 1  10      1
## 2  15      0
## 3  10      1
## 4  15      0

select(df, ends_with("r"))           # Ends with a suffix

##   Gender
## 1      1
## 2      0
## 3      1
## 4      0

          # matches()   Matches a regular expression
          # num_range() Numerical range like x01, x02, x03.
          # one_of()    Variables in character vector.
          # everything()    All variables.
select(df, Age, everything())        # reorder variables so that Age is in the 1st column

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1
## 4  15    B      0

select(df, Gender, Age, Name, everything()) # reorder columns

##   Gender Age Name
## 1      1  10    A
## 2      0  15    B
## 3      1  10    C
## 4      0  15    B

select(df, Gender, Age, Name)               # the same as previous

##   Gender Age Name
## 1      1  10    A
## 2      0  15    B
## 3      1  10    C
## 4      0  15    B

3.13 Filter rows

filter(df, Gender==1)

##   Age Name Gender
## 1  10    A      1
## 2  10    C      1

filter(df, Age>10)

##   Age Name Gender
## 1  15    B      0
## 2  15    B      0

filter(df, Name %in% c('A','B'))

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  15    B      0

filter(df, Name %in% c('A','B') & Age > 10)     # multiple filtering AND

##   Age Name Gender
## 1  15    B      0
## 2  15    B      0

filter(df, Name %in% c('A', 'B') | Gender == 1) # multiple filtering OR

##   Age Name Gender
## 1  10    A      1
## 2  15    B      0
## 3  10    C      1
## 4  15    B      0

filter(df, !Name %in% c('A', 'B'))              # filtering NOT

##   Age Name Gender
## 1  10    C      1

filter(df, grepl("A", Name))                    # grepl function

##   Age Name Gender
## 1  10    A      1

3.14 Summarize

summarize(df, avg = mean(Age), m = median(Age))

summarise_at(df, vars(Gender, Age), funs(n(), mean, median))      # multiple functions
summarise_if(df, is.numeric, funs(n(),mean,median))               # for all numeric columns
summarise_at(df, vars(Gender,Age), function(x) var(x - mean(x)))  # custom function

summarise
summarize_all   # Apply funs to every column
summarize_at    # Apply funs to specific columns
summarize_if    # Apply funs to all cols of one type

3.15 Sort

arrange(df, Age)

##   Age Name Gender
## 1  10    A      1
## 2  10    C      1
## 3  15    B      0
## 4  15    B      0

arrange(df, desc(Age))

##   Age Name Gender
## 1  15    B      0
## 2  15    B      0
## 3  10    A      1
## 4  10    C      1

arrange(df, Age, Name)

##   Age Name Gender
## 1  10    A      1
## 2  10    C      1
## 3  15    B      0
## 4  15    B      0

group_by(df, Age, Name) # group rows by each Age/Name combination (3 groups here)

## # A tibble: 4 x 3
## # Groups:   Age, Name [3]
##     Age Name  Gender
##   <dbl> <fct>  <dbl>
## 1    10 A          1
## 2    15 B          0
## 3    10 C          1
## 4    15 B          0

3.16 Pipes

df %>% select(Age, Name) %>% arrange(Age) %>% filter(Name %in% c('C','B')) %>% distinct() # select columns and sort by and select rows and remove duplicated rows

##   Age Name
## 1  10    C
## 2  15    B

3.17 Combine data

intersect(x, y)   # Rows that appear in both x and y.
union(x, y)       # Rows that appear in either or both x and y.
setdiff(x, y)     # Rows that appear in x but not y.

3.18 ‘by’ is a common variable (primary key) to join by.

inner_join(x, y, by = )
left_join(x, y, by = )
right_join(x, y, by = )
full_join(x, y, by = )
semi_join(x, y, by = )
anti_join(x, y, by = )

if_else(condition, true, false, missing = NULL)
mydf = data.frame(x = c(1:5,NA))

3.19 Nested If_Else

mydf %>%
  mutate(newvar = if_else(is.na(x), "I am missing",
                  if_else(x == 1, "I am one",
                  if_else(x == 2, "I am two",
                  if_else(x == 3, "I am three", "Others")))))

#TODO bind_rows() bind_cols() ntile()

3.20 if() Family of Functions

3.21 TODO

select_if mutate_if pull()

3.22 Vectorize functions to columns

mutate transmute mutate_all mutate_at add_column rename

3.23 How to …

3.23.1 Convert empty strings to NA

df <- c("a", "b", "", "d")
df %>% na_if("")              # "a" "b" NA  "d"

## [1] "a" "b" NA  "d"

3.23.2 Randomly select n rows

df <- data.frame(A=seq(1:10), B=seq(.1,1,.1))
df %>% sample_frac(size=0.3)   # Randomly select fraction of rows

##    A   B
## 1 10 1.0
## 2  8 0.8
## 3  4 0.4

sample_n(df, size, ...)   # Randomly select size rows
slice(df, ...)            # select rows by position

A Minimal Book Example