R-data.table

上一节介绍了最基本的如何操作data.frame的行列来实现各种功能需求的数据计算。这一节将介绍另外一个功能更加强大的R包:data.table

library(data.table)
DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1,3,6), v=1:9);

行列选择

DT;
DT[2]; //第2行
DT[,v]; //第v列(向量形式)
DT[,list(v)]; //第v列(列表形式)
DT[2:3,sum(v)]; //第v列第2行和第3行之和
DT[2:5,cat(v,”\n”)]; //列出第2至第5行v列的值
DT[c(FALSE,TRUE)] //列出偶数行的值

DT;
DT[2];  
DT[,v];
DT[,list(v)];
DT[2:3,sum(v)];
DT[2:5,cat(v,"\n")];
DT[c(FALSE,TRUE)]













xyv
1a11
2a32
3a63
4b14
5b35
6b66
7c17
8c38
9c69





xyv
1a32

  1. 1

  2. 2

  3. 3

  4. 4

  5. 5

  6. 6

  7. 7

  8. 8

  9. 9














v
11
22
33
44
55
66
77
88
99

5
2 3 4 5
NULL









xyv
1a32
2b14
3b66
4c38
DT[,2,with=FALSE];
DT["a"];
DT[y== 6]













y
11
23
36
41
53
66
71
83
96







xyvv2m
1a142NA42
2a342NA42
3a642NA42







xyvv2m
1a642NA42
2b66845
3c69NA8

分组计算

DT;
DT[,sum(v),by=x];
DT[,sum(v),by=key(DT)]
DT["a",sum(v)]













xyv
1a11
2a32
3a63
4b14
5b35
6b66
7c17
8c38
9c69







xV1
1a6
2b15
3c24
DT[,sum(v),by=y]







yV1
1112
2315
3618
DT["a",sum(v)]
DT[c("a","b"),sum(v),by=.EACHI]

6







xV1
1a6
2b15
X = data.table(c("b","c"),foo=c(4,2))
X
DT






V1foo
1b4
2c2













xyv
1a11
2a32
3a63
4b14
5b35
6b66
7c17
8c38
9c69

合并

DT[X] #join(i为data.table,i join to x 通过x的key值)

DT[X]










xyvfoo
1b144
2b354
3b664
4c172
5c382
6c692
DT[X,sum(v),by=.EACHI]      #join and eval j for each row in i
DT[X,sum(v),by=.EACHI]






xV1
1b15
2c24
DT[X,mult="first"]          #first row of each group
DT[X,mult="first"]






xyvfoo
1b144
2c172
DT[X,sum(v)*foo,by=.EACHI]  #join inherited scope
DT[X,sum(v)*foo,by=.EACHI]






xV1
1b60
2c48
setkey(DT,x,y)
setkeyv(DT,c("x","y")
DT
DT["a"]













xyvv2m
1a142NA42
2a342NA42
3a642NA42
4b14845
5b35845
6b66845
7c17NA8
8c38NA8
9c69NA8







xyvv2m
1a142NA42
2a342NA42
3a642NA42

当 i 以列名的形式求值于data.table或list表达式中时,执行selfjoin操作

DT
DT["a"]
DT[.("a",1)]













xyvv2m
1a142NA42
2a342NA42
3a642NA42
4b14845
5b35845
6b66845
7c17NA8
8c38NA8
9c69NA8







xyvv2m
1a142NA42
2a342NA42
3a642NA42







xyvv2mV2
1a142NA421
2a342NA421
3a642NA421
DT[list("a",3:5)]













xyvV2
1a113
2a323
3a633
4a114
5a324
6a634
7a115
8a325
9a635
by: a list() of expressions of column names,在data.table框架中被计算,因此列名是被当作一个变量,然后data.table 以by计算的值分组,为每一个组计算j的值。
当i是data.table时,DT[i,j,by=.EACHI],以DT里面每一行i值分组,你能同时合并(in i)并计算(in j)

DT
DT[,sum(v),by=.(y%%2)]
DT[X]
DT[X,sum(v),by=.EACHI]













xyv
1a11
2a32
3a63
4b14
5b35
6b66
7c17
8c38
9c69






yV1
1127
2018










xyvfoo
1b144
2b354
3b664
4c172
5c382
6c692






xV1
1b15
2c24
x = data.table(c("b","c"),foo=c(4,2))
x
DT
DT[,.SD[2],by=x]






V1foo
1b4
2c2













xyv
1a11
2a32
3a63
4b14
5b35
6b66
7c17
8c38
9c69







xyv
1a32
2b35
3c38
DT[,tail(.SD,2),by=x]










xyv
1a32
2a63
3b35
4b66
5c38
6c69
DT
DT[,lapply(.SD,sum),by=x]













xyv
1a11
2a32
3a63
4b14
5b35
6b66
7c17
8c38
9c69







xyv
1a106
2b1015
3c1024
DT
DT[,list(MySum=sum(v),
MyMin=min(v),
MyMax=max(v)),
by=.(x,y%%2)]













xyv
1a11
2a32
3a63
4b14
5b35
6b66
7c17
8c38
9c69










xyMySumMyMinMyMax
1a1312
2a0333
3b1945
4b0666
5c11578
6c0999
DT[,sum(v),x][V1<20]
DT[,sum(v),x][order(-V1)]






xV1
1a6
2b15







xV1
1c24
2b15
3a6
print(DT[,z:=42L])
print(DT[,z:=NULL])
print(DT["a",v:=42L])

   x y v  z
1: a 1 1 42
2: a 3 2 42
3: a 6 3 42
4: b 1 4 42
5: b 3 5 42
6: b 6 6 42
7: c 1 7 42
8: c 3 8 42
9: c 6 9 42
   x y v
1: a 1 1
2: a 3 2
3: a 6 3
4: b 1 4
5: b 3 5
6: b 6 6
7: c 1 7
8: c 3 8
9: c 6 9
   x y  v
1: a 1 42
2: a 3 42
3: a 6 42
4: b 1  4
5: b 3  5
6: b 6  6
7: c 1  7
8: c 3  8
9: c 6  9

print(DT["b",v2:=84L])

   x y  v v2
1: a 1 42 NA
2: a 3 42 NA
3: a 6 42 NA
4: b 1  4 84
5: b 3  5 84
6: b 6  6 84
7: c 1  7 NA
8: c 3  8 NA
9: c 6  9 NA

add new column by reference by group

#NB: postfix [] is shortcut to print()

DT[,m:=mean(v),by=x][] 













xyvv2m
1a142NA42
2a342NA42
3a642NA42
4b14845
5b35845
6b66845
7c17NA8
8c38NA8
9c69NA8
DT[,.SD[which.min(v)],by=x][]







xyvv2m
1a142NA42
2b14845
3c17NA8
DT[!.("a")]
DT[!"a"]
DT[!2:4]
DT[x!="b" | y!=3]
DT[!.("b",3)]










xyvv2m
1b14845
2b35845
3b66845
4c17NA8
5c38NA8
6c69NA8










xyvv2m
1b14845
2b35845
3b66845
4c17NA8
5c38NA8
6c69NA8










xyvv2m
1a142NA42
2b35845
3b66845
4c17NA8
5c38NA8
6c69NA8












xyvv2m
1a142NA42
2a342NA42
3a642NA42
4b14845
5b66845
6c17NA8
7c38NA8
8c69NA8










xyvv2m
1a142NA42
2a342NA42
3a642NA42
4c17NA8
5c38NA8
6c69NA8
DT1 = data.table(x=c("c", "a", "b", "a", "b"), a=1:5)
DT2 = data.table(x=c("d", "c", "b"), mul=6:8)
DT1
DT2









xa
1c1
2a2
3b3
4a4
5b5







xmul
1d6
2c7
3b8
library(data.table)
DT1[DT2,on=c(x="x")]

Error in `[.data.table`(DT1, DT2, on = c(x = "x")): unused argument (on = c(x = "x"))

update.packages()

Error in contrib.url(repos, type): trying to use CRAN without setting a mirror

rownames(DT)

  1. ‘1’

  2. ‘2’

  3. ‘3’

  4. ‘4’

  5. ‘5’

  6. ‘6’

  7. ‘7’

  8. ‘8’

  9. ‘9’

typeof(colnames(DT))

‘character’

t