# Managing Data using dplyr 
# In previous chapters we have already discussed some tools like the subset() function 
# and the use of [ and $ operators to extract subsets of data frames. However, other operations, 
# like filtering, re-ordering, and collapsing, can often be tedious operations in R whose syntax 
# is not very intuitive. The dplyr package is designed to mitigate a lot of these problems and to 
# provide a highly optimized set of routines specifically for dealing with data frames.
# Some of the key “verbs” provided by the dplyr package are
# * select: return a subset of the columns of a data frame, using a flexible notation
# * filter: extract a subset of rows from a data frame based on logical conditions
# * arrange: reorder rows of a data frame
# * rename: rename variables in a data frame
# * mutate: add new variables/columns or transform existing variables
# * summarise / summarize: generate summary statistics of different variables in the data frame, possibly within strata
# * %>%: the “pipe” operator is used to connect multiple verb actions together into a pipeline
#
# Last Update: June 4, 2020 
# Jin Man Lee, PhD, DePaul University

library(dplyr)
# Load the babynames data set from the babynames package. Only dplyr is
# attached above, so the package is named explicitly; without it data() would
# not find the data set and str() below would fail.
data("babynames", package = "babynames")
# Show the structure (columns and types) of the loaded data.
str(babynames)
# 1. select ----
# Each call keeps or drops columns of babynames. The results are throwaway
# examples and are removed at the end of the section.
d1 <- select(babynames, year)              # a single column
d2 <- select(babynames, year:name)         # a range of adjacent columns
d3 <- select(babynames, name, year)        # listed columns, in the given order
d4 <- select(babynames, -(n:prop))         # drop a range of columns
d5 <- select(babynames, -prop, -n)         # drop the listed columns
d6 <- select(babynames, starts_with("n"))  # column names beginning with "n"
d7 <- select(babynames, ends_with("p"))    # column names ending with "p"
rm(list = paste0("d", 1:7))

# 2. filter ----
# Rows for the name "Taylor" in 2017 (conditions joined explicitly with &).
d1 <- filter(babynames, name == "Taylor" & year == 2017)
# Rows for either "Smith" or "Taylor" in 2017. Comma-separated conditions
# are combined with AND.
d2 <- filter(babynames, name %in% c("Smith", "Taylor"), year == 2017)
# 2017 rows whose count exceeds the mean of n. Note that mean(n) is computed
# over every row of babynames, not only the 2017 rows.
d3 <- filter(babynames, n > mean(n, na.rm = TRUE), year == 2017)
# Remove all three example results, d3 included.
rm(d1, d2, d3)

# 3. arrange ----
# Sort by year, breaking ties by name (both ascending).
d1 <- arrange(.data = babynames, year, name)
# Sort by year descending, then by name ascending.
d2 <- arrange(.data = babynames, desc(year), name)
rm(list = c("d1", "d2"))

# 4. rename ----
# The new name goes on the left of each pair: name becomes firstname and
# year becomes year_born. Column order is unchanged.
d1 <- rename(
  babynames,
  firstname = name,
  year_born = year
)
rm(d1)
# 5. group_by ----
# Group the rows by year, then reduce each group to the mean and median of n.
d1 <- group_by(babynames, year)
summarize(
  d1,
  mean_n = mean(n, na.rm = TRUE),
  median_n = median(n, na.rm = TRUE)
)

# 6. mutate ----
# Ungrouped: nmean is the mean of n over all rows, and n_dev is each row's
# deviation from it. Later columns in mutate() can use earlier ones.
d1 <- mutate(babynames, nmean = mean(n), n_dev = n - nmean)
# Grouped by sex: the same two columns, with the mean taken within each sex.
d2 <- mutate(group_by(babynames, sex), nmean = mean(n), n_dev = n - nmean)

# 7. %>% pipeline ----
# Mean count per year and sex for years after 2010. Each verb receives the
# result of the previous step as its first argument.
# A mutate() step is not needed before summarize(): summarize() keeps only
# the grouping columns and the summaries it computes.
babynames %>%
  select(n, year, name, sex) %>%
  filter(year > 2010) %>%
  group_by(year, sex) %>%
  summarize(mean_n = mean(n))
  
# How many boys were given the name "Taylor"? Keep the matching rows, keep
# only the n column, and add it up (nested-call form, read inside-out).
sum(
  select(
    filter(babynames, sex == "M" & name == "Taylor"),
    n
  )
)
# Using pipeline: yearly counts of boys named "Taylor" for years after 2015.
# print() is given n = Inf by name so that every row is shown. A bare `n`
# there would not refer to the column or to a row count; it is looked up
# outside the data and passed positionally.
babynames %>%
  filter(sex == "M", name == "Taylor", year > 2015) %>%
  select(year, n) %>%
  print(n = Inf)