# Managing Data using dplyr
#
# In previous chapters we have already discussed some tools like the subset()
# function and the use of the [ and $ operators to extract subsets of data
# frames. However, other operations, like filtering, re-ordering, and
# collapsing, can often be tedious in R, and their syntax is not very
# intuitive. The dplyr package is designed to mitigate a lot of these problems
# and to provide a highly optimized set of routines specifically for dealing
# with data frames.
#
# Some of the key "verbs" provided by the dplyr package are
#   * select: return a subset of the columns of a data frame, using a
#     flexible notation
#   * filter: extract a subset of rows from a data frame based on logical
#     conditions
#   * arrange: reorder rows of a data frame
#   * rename: rename variables in a data frame
#   * mutate: add new variables/columns or transform existing variables
#   * summarise / summarize: generate summary statistics of different
#     variables in the data frame, possibly within strata
#   * %>%: the "pipe" operator is used to connect multiple verb actions
#     together into a pipeline
#
# Last Update: June 4, 2020
# Jin Man Lee, PhD, DePaul University

library(dplyr)

# The babynames data set ships in the babynames package, not in dplyr, so the
# package is named explicitly; this works without attaching babynames.
data("babynames", package = "babynames")
str(babynames)

# 1. select
# Each call keeps a different subset of columns.
d1 <- select(babynames, year)                # a single column
d2 <- select(babynames, year:name)           # a contiguous range of columns
d3 <- select(babynames, name, year)          # columns in a chosen order
d4 <- select(babynames, -(n:prop))           # drop a range of columns
d5 <- select(babynames, -c(prop, n))         # drop listed columns
d6 <- select(babynames, starts_with("n"))    # names beginning with "n"
d7 <- select(babynames, ends_with("p"))      # names ending with "p"
rm(d1, d2, d3, d4, d5, d6, d7)

# 2. filter
# Comma-separated conditions are combined with AND.
d1 <- filter(babynames, name == "Taylor", year == 2017)
d2 <- filter(babynames, name %in% c("Smith", "Taylor"), year == 2017)
# NOTE: mean(n) is evaluated on the full, unfiltered data, so this keeps the
# 2017 rows whose count exceeds the mean over all rows.
d3 <- filter(babynames, n > mean(n, na.rm = TRUE), year == 2017)
rm(d1, d2, d3)

# 3. arrange
d1 <- arrange(babynames, year, name)
d2 <- arrange(babynames, desc(year), name)
rm(d1, d2)

# 4. rename
# Syntax is new_name = old_name.
d1 <- rename(babynames, year_born = year, firstname = name)
rm(d1)

# 5. group_by
# Summaries of a grouped data frame are computed once per group (per year).
d1 <- group_by(babynames, year)
summarize(
  d1,
  mean_n = mean(n, na.rm = TRUE),
  median_n = median(n, na.rm = TRUE)
)

# 6. mutate
# Ungrouped: nmean is the mean of n over all rows.
d1 <- mutate(babynames, nmean = mean(n), n_dev = n - mean(n))

# Grouped: nmean is the mean of n within each sex. The grouping is removed
# afterwards so later operations on d2 are not silently done per group.
d2 <- babynames %>%
  group_by(sex) %>%
  mutate(nmean = mean(n), n_dev = n - mean(n)) %>%
  ungroup()

# 7. %>% pipeline
# The mutate() step only illustrates chaining: summarize() collapses each
# (year, sex) group to one row and does not keep the nmean column.
babynames %>%
  select(n, year, name, sex) %>%
  filter(year > 2010) %>%
  group_by(year, sex) %>%
  mutate(nmean = mean(n)) %>%
  summarize(mean_n = mean(n)) %>%
  ungroup()

# How many boys with the name "Taylor"
# pull() extracts the n column as a vector so sum() adds plain numbers rather
# than a one-column data frame.
babynames %>%
  filter(sex == "M", name == "Taylor") %>%
  pull(n) %>%
  sum()

# Using pipeline
# print() takes the row count through its named `n` argument; a bare `n` is
# not a row count here (no such variable is defined in this script).
# n = Inf prints every row.
babynames %>%
  filter(sex == "M", name == "Taylor", year > 2015) %>%
  select(year, n) %>%
  print(n = Inf)