Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, a fast friendly file reader and parallel file writer. Offers a natural and flexible syntax, for faster development.
dplyr
A fast, consistent tool for working with data frame like objects, both in memory and out of memory.
Pipelines.
tidyr
An evolution of ‘reshape2’. It’s designed specifically for data tidying (not general reshaping or aggregating) and works well with dplyr data pipelines.
# The data.table packagelibrary(data.table)# Create my_first_data_tablemy_first_data_table<-data.table(x=c('a','b','c','d','e'),y=c(1,2,3,4,5))# Create a data.table using recyclingDT<-data.table(a=1:2,b=c('A','B','C','D'))# Print the third row to the consoleDT[3,]
12
## a b
## 1: 1 C
# Print the second and third row to the console, but do not commasDT[2:3]
DT[i, j, by] means take DT, subset rows using i, then calculate j grouped by by. You can wrap j with .().
A<-c(1,2,3,4,5)B<-c('a','b','c','d','e')C<-c(6,7,8,9,10)DT<-data.table(A,B,C)# Subset rows 1 and 3, and columns B and CDT[c(1,3),.(B,C)]
123
## B C
## 1: a 6
## 2: c 8
# Assign to ans the correct valueans<-data.table(DT[,.(B,val=A*C)])# Fill in the blanks such that ans2 equals target#target <- data.table(B = c('a', 'b', 'c', 'd', 'e', 'a', 'b', 'c', 'd', 'e'), val = as.integer(c(6:10, 1:5)))ans2<-data.table(DT[,.(B,val=as.integer(c(6:10,1:5)))])
# iris and iris3 are already available in the workspace# Convert iris to a data.table: DTDT<-as.data.table(iris)# For each Species, print the mean Sepal.LengthDT[,.(mean(Sepal.Length)),by=.(Species)]
# data.table version of iris: DTDT<-as.data.table(iris)# Group the specimens by Sepal area (to the nearest 10 cm2) and count how many occur in each group.DT[,.N,by=10*round(Sepal.Length*Sepal.Width/10)]
1234
## round N
## 1: 20 117
## 2: 10 29
## 3: 30 4
# Now name the output columns `Area` and `Count`DT[,.(Count=.N),by=.(Area=10*round(Sepal.Length*Sepal.Width/10))]
# Create the data.table DTset.seed(1L)DT<-data.table(A=rep(letters[2:1],each=4L),B=rep(1:4,each=2L),C=sample(8))# Create the new data.table, DT2DT2<-DT[,.(C=cumsum(C)),by=.(A,B)]# Select from DT2 the last two values from C while you group by ADT2[,.(C=tail(C,2)),by=A]
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Use chaining# Cumsum of C while grouping by A and B, and then select last two values of C while grouping by ADT[,.(C=cumsum(C)),by=.(A,B)][,.(C=tail(C,2)),by=.(A)]
12345
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
Chaining your iris dataset
DT<-data.table(iris)# Perform chained operations on DTDT[,.(Sepal.Length=median(Sepal.Length),Sepal.Width=median(Sepal.Width),Petal.Length=median(Petal.Length),Petal.Width=median(Petal.Width)),by=.(Species)][order(Species,decreasing=TRUE)]
.SDcols specifies the columns of DT that are included in .SD.
grp<-c(6,6,8,8,8)Q1<-c(4,3,3,5,3)Q2<-c(1,4,1,4,4)Q3<-c(3,1,5,5,2)H1<-c(1,2,3,2,4)H2<-c(1,4,3,4,3)DT<-data.table(grp,Q1,Q2,Q3,H1,H2)# Calculate the sum of the Q columnsDT[,lapply(.SD,sum),.SDcols=2:4]
12
## Q1 Q2 Q3
## 1: 18 14 16
# Calculate the sum of columns H1 and H2 DT[,lapply(.SD,sum),.SDcols=5:6]
12
## H1 H2
## 1: 12 15
# Select all but the first row of groups 1 and 2, returning only the grp column and the Q columns. DT[,.SD[-1],.SDcols=2:4,by=.(grp)]
x<-c(2,1,2,1,2,2,1)y<-c(1,3,5,7,9,11,13)z<-c(2,4,6,8,10,12,14)DT<-data.table(x,y,z)# Sum of all columns and the number of rows# For the first part, you need to combine the returned list from lapply, .SD and .SDcols and the integer vector of .N. You have to this because the result of the two together has to be a list again, with all values put together.DT
The set function is used to repeatedly update a data.table by reference. You can think of the set function as a loopable.
A<-c(2,2,3,5,2,5,5,4,4,1)B<-c(2,1,4,2,4,3,4,5,2,4)C<-c(5,2,4,1,2,2,1,2,5,2)D<-c(3,3,3,1,5,4,4,1,4,3)DT<-data.table(A,B,C,D)# Set the seedset.seed(1)# Check the DTDT
# For loop with setfor (lin2:4)set(DT,sample(10,3),l,NA)# Change the column names to lowercasesetnames(DT,c('A','B','C','D'),c('a','b','c','d'))# Print the resulting DT to the consoleDT
1 2 3 4 5 6 7 8 91011
## a b c d
## 1: 2 2 5 3
## 2: 2 1 NA 3
## 3: 3 NA 4 3
## 4: 5 NA 1 1
## 5: 2 NA 2 5
## 6: 5 3 2 NA
## 7: 5 4 1 4
## 8: 4 5 NA 1
## 9: 4 2 5 NA
## 10: 1 4 NA NA
# Convert iris to a data.tableiris<-data.table('Sepal.Length'=iris$Sepal.Length,'Sepal.Width'=iris$Sepal.Width,'Petal.Length'=iris$Petal.Length,'Petal.Width'=iris$Petal.Width,'Species'=iris$Species)iris
# Remove the 'Sepal.' prefix#gsub('([ab])', '\\1_\\1_', 'abc and ABC') = pattern, replacement, xsetnames(iris,c('Sepal.Length','Sepal.Width'),c('Length','Width'))#gsub('^Sepal\\.','', iris)# Remove the two columns starting with 'Petal'iris[,c('Petal.Length','Petal.Width'):=NULL]
# Keyed data.table DTDT<-data.table(A=letters[c(2,1,2,3,1,2,3)],B=c(5,4,1,9,8,8,6),C=6:12,key='A,B')# Get the key of DTkey(DT)
1
## [1] "A" "B"
# Row where A == 'b' & B == 6setkey(DT,A,B)DT[.('b',6)]
12
## A B C
## 1: b 6 NA
# Return the prevailing rowDT[.('b',6),roll=TRUE]
12
## A B C
## 1: b 6 6
# Return the nearest rowDT[.('b',6),roll=+Inf]
12
## A B C
## 1: b 6 6
Rolling joins – part two
# Keyed data.table DTDT<-data.table(A=letters[c(2,1,2,3,1,2,3)],B=c(5,4,1,9,8,8,6),C=6:12,key='A,B')# Look at the sequence (-2):10 for the 'b' groupDT[.('b',(-2):10)]
1 2 3 4 5 6 7 8 91011121314
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 NA
## 6: b 3 NA
## 7: b 4 NA
## 8: b 5 6
## 9: b 6 NA
## 10: b 7 NA
## 11: b 8 11
## 12: b 9 NA
## 13: b 10 NA
# Add code: carry the prevailing values forwardsDT[.('b',(-2):10),roll=TRUE]
1 2 3 4 5 6 7 8 91011121314
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
# Add code: carry the first observation backwardsDT[.('b',(-2):10),roll=TRUE,rollends=TRUE]
1 2 3 4 5 6 7 8 91011121314
## A B C
## 1: b -2 8
## 2: b -1 8
## 3: b 0 8
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11