data.table
Fast aggregation of large data (e.g. 100GB in RAM), fast ordered
joins, fast add/modify/delete of columns by group using no copies at
all, list columns, a fast friendly file reader and parallel
file writer. Offers a natural and flexible syntax, for
faster development.
dplyr
A fast, consistent tool for working with data frame like objects,
both in memory and out of memory.
Pipelines.
tidyr
An evolution of ‘reshape2’. It’s designed specifically for data
tidying (not general reshaping or aggregating) and works well with
dplyr data pipelines.
# The data.table package
library(data.table)

# Create my_first_data_table
my_first_data_table <- data.table(
  x = c('a', 'b', 'c', 'd', 'e'),
  y = c(1, 2, 3, 4, 5)
)

# Create a data.table using recycling: a = 1:2 is recycled to length 4
DT <- data.table(a = 1:2, b = c('A', 'B', 'C', 'D'))

# Print the third row to the console
DT[3, ]
## a b
## 1: 1 C

# Print the second and third row to the console, but do not use commas
DT[2:3]
## a b
## 1: 2 B
## 2: 1 C
Getting to know a data.table
These row selections work much like head() and tail(): they inspect particular rows of a data.table.
# Print the penultimate row of DT using .N (.N is the number of rows)
DT[.N - 1]
## a b
## 1: 1 C

# Print the column names of DT, and number of rows and number of columns
colnames(DT)
## [1] "a" "b"

dim(DT)
## [1] 4 2

# Select row 2 twice and row 3, returning a data.table with three rows
# where row 2 is a duplicate of row 1.
DT[c(2, 2, 3)]
## a b
## 1: 2 B
## 2: 2 B
## 3: 1 C
DT is a data.table/data.frame, but DT[ , B] is a vector; DT[ , .(B)] is a subsetted data.table.
Subsetting data tables
DT[i, j, by] means take DT, subset rows using i, then calculate j grouped by by. You can wrap j with .().
A <- c(1, 2, 3, 4, 5)
B <- c('a', 'b', 'c', 'd', 'e')
C <- c(6, 7, 8, 9, 10)
DT <- data.table(A, B, C)

# Subset rows 1 and 3, and columns B and C
DT[c(1, 3), .(B, C)]
## B C
## 1: a 6
## 2: c 8

# Assign to ans the correct value
ans <- data.table(DT[, .(B, val = A * C)])

# Fill in the blanks such that ans2 equals target
# target <- data.table(B = c('a', 'b', 'c', 'd', 'e', 'a', 'b', 'c', 'd', 'e'),
#                      val = as.integer(c(6:10, 1:5)))
# B (length 5) is recycled against val (length 10), giving 10 rows.
ans2 <- data.table(DT[, .(B, val = as.integer(c(6:10, 1:5)))])
The by basics
# iris and iris3 are already available in the workspace

# Convert iris to a data.table: DT
DT <- as.data.table(iris)

# For each Species, print the mean Sepal.Length
DT[, .(mean(Sepal.Length)), by = .(Species)]

# Print mean Sepal.Length, grouping by first letter of Species
DT[, .(mean(Sepal.Length)), by = .(substr(Species, 1, 1))]
## substr V1
## 1: s 5.006
## 2: v 6.262
Using .N and by
.N is an integer holding the number of rows in the current group; it can be used in both i and j.
# data.table version of iris: DT
DT <- as.data.table(iris)

# Group the specimens by Sepal area (to the nearest 10 cm^2) and count
# how many occur in each group.
DT[, .N, by = 10 * round(Sepal.Length * Sepal.Width / 10)]
## round N
## 1: 20 117
## 2: 10 29
## 3: 30 4

# Now name the output columns `Area` and `Count`
DT[, .(Count = .N), by = .(Area = 10 * round(Sepal.Length * Sepal.Width / 10))]
## Area Count
## 1: 20 117
## 2: 10 29
## 3: 30 4
Return multiple numbers in j
# Create the data.table DT
set.seed(1L)
DT <- data.table(
  A = rep(letters[2:1], each = 4L),
  B = rep(1:4, each = 2L),
  C = sample(8)
)

# Create the new data.table, DT2
DT2 <- DT[, .(C = cumsum(C)), by = .(A, B)]

# Select from DT2 the last two values from C while you group by A
DT2[, .(C = tail(C, 2)), by = A]
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6

# Use chaining
# Cumsum of C while grouping by A and B, and then select last two values
# of C while grouping by A
DT[, .(C = cumsum(C)), by = .(A, B)][, .(C = tail(C, 2)), by = .(A)]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
Chaining your iris dataset
DT<-data.table(iris)# Perform chained operations on DTDT[,.(Sepal.Length=median(Sepal.Length),Sepal.Width=median(Sepal.Width),Petal.Length=median(Petal.Length),Petal.Width=median(Petal.Width)),by=.(Species)][order(Species,decreasing=TRUE)]
x <- c(2, 1, 2, 1, 2, 2, 1)
y <- c(1, 3, 5, 7, 9, 11, 13)
z <- c(2, 4, 6, 8, 10, 12, 14)
DT <- data.table(x, y, z)

# Mean of columns (per group of x); .SD holds all non-grouping columns
DT[, lapply(.SD, mean), by = .(x)]
## x y z
## 1: 2 6.500000 7.500000
## 2: 1 7.666667 8.666667

# Median of columns
DT[, lapply(.SD, median), by = .(x)]
## x y z
## 1: 2 7 8
## 2: 1 7 8
Introducing .SDcols
.SDcols specifies the columns of DT that are included in .SD.
grp <- c(6, 6, 8, 8, 8)
Q1 <- c(4, 3, 3, 5, 3)
Q2 <- c(1, 4, 1, 4, 4)
Q3 <- c(3, 1, 5, 5, 2)
H1 <- c(1, 2, 3, 2, 4)
H2 <- c(1, 4, 3, 4, 3)
DT <- data.table(grp, Q1, Q2, Q3, H1, H2)

# Calculate the sum of the Q columns (columns 2 to 4)
DT[, lapply(.SD, sum), .SDcols = 2:4]
## Q1 Q2 Q3
## 1: 18 14 16

# Calculate the sum of columns H1 and H2
DT[, lapply(.SD, sum), .SDcols = 5:6]
## H1 H2
## 1: 12 15

# Select all but the first row of each group, returning only the grp
# column and the Q columns.
DT[, .SD[-1], .SDcols = 2:4, by = .(grp)]
x<-c(2,1,2,1,2,2,1)y<-c(1,3,5,7,9,11,13)z<-c(2,4,6,8,10,12,14)DT<-data.table(x,y,z)# Sum of all columns and the number of rows# For the first part, you need to combine the returned list from lapply, .SD and .SDcols and the integer vector of .N. You have to this because the result of the two together has to be a list again, with all values put together.DT
The set() function is used to repeatedly update a data.table by
reference. You can think of set() as a loopable, low-overhead version of the := operator.
A <- c(2, 2, 3, 5, 2, 5, 5, 4, 4, 1)
B <- c(2, 1, 4, 2, 4, 3, 4, 5, 2, 4)
C <- c(5, 2, 4, 1, 2, 2, 1, 2, 5, 2)
D <- c(3, 3, 3, 1, 5, 4, 4, 1, 4, 3)
DT <- data.table(A, B, C, D)

# Set the seed
set.seed(1)

# Check the DT
DT

# For loop with set: for each of columns 2 to 4, overwrite 3 randomly
# sampled rows with NA (the original line was mangled to "for (lin2:4)")
for (l in 2:4) set(DT, sample(10, 3), l, NA)

# Change the column names to lowercase
setnames(DT, c('A', 'B', 'C', 'D'), c('a', 'b', 'c', 'd'))

# Print the resulting DT to the console
DT
## a b c d
## 1: 2 2 5 3
## 2: 2 1 NA 3
## 3: 3 NA 4 3
## 4: 5 NA 1 1
## 5: 2 NA 2 5
## 6: 5 3 2 NA
## 7: 5 4 1 4
## 8: 4 5 NA 1
## 9: 4 2 5 NA
## 10: 1 4 NA NA
# Convert iris to a data.table
iris <- data.table(
  'Sepal.Length' = iris$Sepal.Length,
  'Sepal.Width'  = iris$Sepal.Width,
  'Petal.Length' = iris$Petal.Length,
  'Petal.Width'  = iris$Petal.Width,
  'Species'      = iris$Species
)
iris

# Remove the 'Sepal.' prefix
# gsub('([ab])', '\\1_\\1_', 'abc and ABC') = pattern, replacement, x
setnames(iris, c('Sepal.Length', 'Sepal.Width'), c('Length', 'Width'))
# gsub('^Sepal\\.','', iris)

# Remove the two columns starting with 'Petal' (by reference)
iris[, c('Petal.Length', 'Petal.Width') := NULL]
# The 'keyed' data.table DT
DT <- data.table(
  A = letters[c(2, 1, 2, 3, 1, 2, 3)],
  B = c(5, 4, 1, 9, 8, 8, 6),
  C = 6:12
)
setkey(DT, A, B)

# Select the 'b' group
DT['b']
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11

# 'b' and 'c' groups
DT[c('b', 'c')]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
## 4: c 6 12
## 5: c 9 9

# The first row of the 'b' and 'c' groups
DT[c('b', 'c'), mult = 'first']
## A B C
## 1: b 1 8
## 2: c 6 12

# First and last row of the 'b' and 'c' groups (.EACHI groups by each
# value of i, so .SD is evaluated once per matched group)
DT[c('b', 'c'), .SD[c(1, .N)], by = .EACHI]
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9

# Copy and extend code for instruction 4: add printout
DT[c('b', 'c'), { print(.SD); .SD[c(1, .N)] }, by = .EACHI]
## B C
## 1: 1 8
## 2: 5 6
## 3: 8 11
## B C
## 1: 6 12
## 2: 9 9
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
Rolling joins - part one
# Keyed data.table DT
DT <- data.table(
  A = letters[c(2, 1, 2, 3, 1, 2, 3)],
  B = c(5, 4, 1, 9, 8, 8, 6),
  C = 6:12,
  key = 'A,B'
)

# Get the key of DT
key(DT)
## [1] "A" "B"

# Row where A == 'b' & B == 6 (no exact match, so C is NA)
setkey(DT, A, B)
DT[.('b', 6)]
## A B C
## 1: b 6 NA

# Return the prevailing row (roll the last observation forward)
DT[.('b', 6), roll = TRUE]
## A B C
## 1: b 6 6

# Return the nearest row
# NOTE(review): the original had roll = +Inf here, which rolls forward
# like roll = TRUE and contradicts the comment; 'nearest' matches the
# stated intent (and yields the same result for this query).
DT[.('b', 6), roll = 'nearest']
## A B C
## 1: b 6 6
Rolling joins - part two
# Keyed data.table DT
DT <- data.table(
  A = letters[c(2, 1, 2, 3, 1, 2, 3)],
  B = c(5, 4, 1, 9, 8, 8, 6),
  C = 6:12,
  key = 'A,B'
)

# Look at the sequence (-2):10 for the 'b' group
DT[.('b', (-2):10)]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 NA
## 6: b 3 NA
## 7: b 4 NA
## 8: b 5 6
## 9: b 6 NA
## 10: b 7 NA
## 11: b 8 11
## 12: b 9 NA
## 13: b 10 NA

# Add code: carry the prevailing values forwards
DT[.('b', (-2):10), roll = TRUE]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11

# Add code: carry the first observation backwards as well
DT[.('b', (-2):10), roll = TRUE, rollends = TRUE]
## A B C
## 1: b -2 8
## 2: b -1 8
## 3: b 0 8
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
# Load the dplyr package
library(dplyr)
library(dtplyr)

# Load the hflights package
# A data-only package containing commercial domestic flights that
# departed Houston (IAH and HOU) in 2011
library(hflights)

# Call both head() and summary() on hflights
# NOTE(review): the original transcript only called head() despite the
# comment; summary() added to match the stated instruction.
head(hflights)
summary(hflights)

# Create the object carriers, containing only the UniqueCarrier variable
# of hflights
carriers <- hflights$UniqueCarrier
Changing labels of hflights, part 1 of 2
# Build the lookup table: lut (carrier code -> carrier name)
lut <- c('AA' = 'American', 'AS' = 'Alaska', 'B6' = 'JetBlue',
         'CO' = 'Continental', 'DL' = 'Delta', 'OO' = 'SkyWest',
         'UA' = 'United', 'US' = 'US_Airways', 'WN' = 'Southwest',
         'EV' = 'Atlantic_Southeast', 'F9' = 'Frontier', 'FL' = 'AirTran',
         'MQ' = 'American_Eagle', 'XE' = 'ExpressJet', 'YV' = 'Mesa')

# Use lut to translate the UniqueCarrier column of hflights
hflights$UniqueCarrier <- lut[hflights$UniqueCarrier]

# Inspect the resulting raw values of your variables
glimpse(hflights)

# Build the lookup table: lut (cancellation code -> reason)
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA",
         "D" = "security", "E" = "not cancelled")

# Add the Code column
hflights$Code <- lut[hflights$CancellationCode]

# Glimpse at hflights
glimpse(hflights)
# Print out a tbl as described in the second instruction, using both
# helper functions and variable names
select(hflights, UniqueCarrier, ends_with('Num'), starts_with('Cancel'))
## # A tibble: 227,496 x 5
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## * <chr> <int> <chr> <int> <chr>
## 1 American 428 N576AA 0
## 2 American 428 N557AA 0
## 3 American 428 N541AA 0
## 4 American 428 N403AA 0
## 5 American 428 N492AA 0
## 6 American 428 N262AA 0
## 7 American 428 N493AA 0
## 8 American 428 N477AA 0
## 9 American 428 N476AA 0
## 10 American 428 N504AA 0
## # ... with 227,486 more rows

# Print out a tbl as described in the third instruction, using only
# helper functions.
select(hflights, ends_with('Time'), ends_with('Delay'))
# Add the new variable ActualGroundTime to a copy of hflights and save
# the result as g1
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
glimpse(g1)

# Add a second variable loss_percent to the dataset: m1
m1 <- mutate(hflights,
             loss = ArrDelay - DepDelay,
             loss_percent = (ArrDelay - DepDelay) / DepDelay * 100)

# Copy and adapt the previous command to reduce redundancy: m2
# (mutate() can reuse a column defined earlier in the same call)
m2 <- mutate(hflights,
             loss = ArrDelay - DepDelay,
             loss_percent = loss / DepDelay * 100)

# Add the three variables as described in the third instruction: m3
m3 <- mutate(hflights,
             TotalTaxi = TaxiIn + TaxiOut,
             ActualGroundTime = ActualElapsedTime - AirTime,
             Diff = TotalTaxi - ActualGroundTime)
glimpse(m3)
# All cancelled weekend flights
filter(hflights, DayOfWeek %in% c(6, 7) & Cancelled == 1)
## # A tibble: 585 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 9 7 NA NA American
## 2 2011 1 29 6 NA NA Continental
## 3 2011 1 9 7 NA NA Continental
## 4 2011 1 9 7 NA NA Delta
## 5 2011 1 9 7 NA NA SkyWest
## 6 2011 1 2 7 NA NA Southwest
## 7 2011 1 29 6 NA NA Delta
## 8 2011 1 9 7 NA NA Atlantic_Southeast
## 9 2011 1 1 6 NA NA AirTran
## 10 2011 1 9 7 NA NA AirTran
## # ... with 575 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>

# All flights that were cancelled after being delayed
filter(hflights, DepDelay > 0 & Cancelled == 1)
## # A tibble: 40 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 26 3 1926 NA Continental
## 2 2011 1 11 2 1100 NA US_Airways
## 3 2011 1 19 3 1811 NA ExpressJet
## 4 2011 1 7 5 2028 NA ExpressJet
## 5 2011 2 4 5 1638 NA American
## 6 2011 2 8 2 1057 NA Continental
## 7 2011 2 2 3 802 NA ExpressJet
## 8 2011 2 9 3 904 NA ExpressJet
## 9 2011 2 1 2 1508 NA SkyWest
## 10 2011 3 31 4 1016 NA Continental
## # ... with 30 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
Blend together what you’ve learned!
# Select the flights that had JFK as their destination: c1
c1 <- filter(hflights, Dest == 'JFK')

# Combine the Year, Month and DayofMonth variables to create a Date
# column: c2
c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep = '-'))

# Print out a selection of columns of c2
select(c2, Date, DepTime, ArrTime, TailNum)

# Remove rows that have NA ArrDelay: temp1
temp1 <- filter(hflights, !is.na(ArrDelay))

# Generate summary about ArrDelay column of temp1
summarise(temp1,
          earliest = min(ArrDelay),
          average = mean(ArrDelay),
          latest = max(ArrDelay),
          sd = sd(ArrDelay))

# Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
temp2 <- filter(hflights, !is.na(TaxiIn), !is.na(TaxiOut))

# Print the maximum taxiing difference of temp2 with summarise()
summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))

# Filter hflights to keep all American Airline flights: aa
aa <- filter(hflights, UniqueCarrier == 'American')

# Generate summarizing statistics for aa
summarise(aa,
          n_flights = n(),
          n_canc = sum(Cancelled),
          p_canc = n_canc / n_flights * 100,
          avg_delay = mean(ArrDelay, na.rm = TRUE))
# Part 1, concerning the selection and creation of columns
d <- hflights %>%
  select(Dest, UniqueCarrier, Distance, ActualElapsedTime) %>%
  mutate(RealTime = ActualElapsedTime + 100,
         mph = Distance / RealTime * 60)

# Part 2, concerning flights that had an actual average speed of < 70 mph
d %>%
  filter(!is.na(mph), mph < 70) %>%
  summarise(n_less = n(),
            n_dest = n_distinct(Dest),
            min_dist = min(Distance),
            max_dist = max(Distance))

# Solve the exercise using a combination of dplyr verbs and %>%
hflights %>%
  # summarise(all_flights = n()) %>%
  filter(((Distance / (ActualElapsedTime + 100) * 60) < 105) |
           Cancelled == 1 | Diverted == 1) %>%
  summarise(n_non = n(),
            # 22751 appears to be a flight count computed in an earlier
            # exercise step -- TODO confirm against the course material
            p_non = n_non / 22751 * 100,
            n_dest = n_distinct(Dest),
            min_dist = min(Distance),
            max_dist = max(Distance))

# Make an ordered per-carrier summary of hflights
hflights %>%
  group_by(UniqueCarrier) %>%
  summarise(n_flights = n(),
            n_canc = sum(Cancelled == 1),
            p_canc = mean(Cancelled == 1) * 100,
            avg_delay = mean(ArrDelay, na.rm = TRUE)) %>%
  arrange(avg_delay, p_canc)

# Make an ordered per-day summary of hflights
hflights %>%
  group_by(DayOfWeek) %>%
  summarise(avg_taxi = mean(TaxiIn + TaxiOut, na.rm = TRUE)) %>%
  arrange(desc(avg_taxi))

# Solution to first instruction: proportion of delayed arrivals per carrier
hflights %>%
  filter(!is.na(ArrDelay)) %>%
  group_by(UniqueCarrier) %>%
  summarise(p_delay = sum(ArrDelay > 0) / n()) %>%
  mutate(rank = rank(p_delay)) %>%
  arrange(rank)

# Solution to second instruction: average delay of delayed flights per carrier
hflights %>%
  filter(!is.na(ArrDelay), ArrDelay > 0) %>%
  group_by(UniqueCarrier) %>%
  summarise(avg = mean(ArrDelay)) %>%
  mutate(rank = rank(avg)) %>%
  arrange(rank)

# Which plane (by tail number) flew out of Houston the most times?
# How many times? adv1
adv1 <- hflights %>%
  group_by(TailNum) %>%
  summarise(n = n()) %>%
  filter(n == max(n))

# How many airplanes only flew to one destination from Houston? adv2
adv2 <- hflights %>%
  group_by(TailNum) %>%
  summarise(ndest = n_distinct(Dest)) %>%
  filter(ndest == 1) %>%
  summarise(nplanes = n())

# Find the most visited destination for each carrier: adv3
adv3 <- hflights %>%
  group_by(UniqueCarrier, Dest) %>%
  summarise(n = n()) %>%
  mutate(rank = rank(desc(n))) %>%
  filter(rank == 1)

# Find the carrier that travels to each destination the most: adv4
adv4 <- hflights %>%
  group_by(Dest, UniqueCarrier) %>%
  summarise(n = n()) %>%
  mutate(rank = rank(desc(n))) %>%
  filter(rank == 1)
dplyr deals with different types
# Use summarise to calculate n_carriers2<-hflights%>%summarise(n_carrier=n_distinct(UniqueCarrier))
dplyr and mySQL databases
Code only.
# set up a src that connects to the mysql database (src_mysql is provided by dplyr)my_db<-src_mysql(dbname='dplyr',host='dplyr.csrrinzqubik.us-east-1.rds.amazonaws.com',port=3306,user='dplyr',password='dplyr')# and reference a table within that src: nycflights is now available as an R object that references to the remote nycflights tablenycflights<-tbl(my_db,'dplyr')# glimpse at nycflightsglimpse(nycflights)# Calculate the grouped summaries detailed in the instructionsnycflights%>%group_by(carrier)%>%summarise(n_flights=n(),avg_delay=mean(arr_delay))%>%arrange(avg_delay)