library(data.table)
DT = data.table(a=LETTERS[c(1,1:3)],b=4:7,key="a")
DT








ab
1A4
2A5
3B6
4C7

:= 通过引用的方式增加或者更新列
类似transform方法,但是更容易操作

DT <- data.table(a=rep(1:3, each=2), b=1:6)
DT2 <- transform(DT, c = a^2)
DT[, c:=a^2]
identical(DT,DT2)










abc
1111
2121
3234
4244
5359
6369

TRUE

DT2 <- within(DT, {
b <- rev(b)
c <- a*2
rm(a)
})
DT[,`:=`(b = rev(b),
c = a*2,
a = NULL)]
identical(DT,DT2)










bc
162
252
344
434
526
616

TRUE

DT[,c:=8]
DT[,d:=9L]
DT[,c:=NULL]
DT[2,d:=10L]








abdc
1A498
2A5108
3B698
4C798








abdc
1A498
2A598
3B698
4C798








abd
1A49
2A59
3B69
4C79








abd
1A49
2A510
3B69
4C79
DT[b>4,b:=d*2L]
DT["A",b:=0L]








abd
1A49
2A2010
3B189
4C189








abd
1A09
2A010
3B189
4C189
DT[,e:=mean(d),by=a]
DT["B",f:=mean(d)]








abde
1A099.5
2A0109.5
3B1899
4C1899








abdef
1A099.5NA
2A0109.5NA
3B18999
4C1899NA

between 选择符合范围判断的项

DT = data.table(a=1:5, b=6:10)
DT[b %between% c(7,9)]







ab
127
238
349

chmatch 快速匹配两个字符向量,返回一个匹配成功位置的向量

a<-LETTERS[c(1:5)]
b<-LETTERS[c(4:10)]
chmatch(a,b) 
a %chin% b

  1. NA

  2. NA

  3. NA

  4. 1

  5. 2


  1. FALSE

  2. FALSE

  3. FALSE

  4. TRUE

  5. TRUE

dcast.data.table 数据表的整形

melt.data.table用法

require(reshape2)
require(data.table)
DT <- data.table(
           i1 = c(1:5, NA), 
           i2 = c(NA,6,7,8,9,10), 
           f1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), 
           c1 = sample(c(letters[1:3], NA), 6, TRUE), 
           d1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), 
           d2 = as.Date(6:1, origin="2012-01-01"))
DT[, l1 := DT[, list(c=list(rep(i1, sample(5,1)))), by = i1]$c] # list cols
DT[, l2 := DT[, list(c=list(rep(c1, sample(5,1)))), by = i1]$c]

melt(DT, id=1:2, measure=3) 
melt(DT, id=c("i1", "i2"), measure="f1", value.factor=TRUE) # same as above, but value is factor

melt(DT, id=c("i1", "f1"), measure=c("d1", "d2")) # date class attribute lost
melt(DT, id=c("i1", "f1"), measure=c("c1", "d1")) # value is char, date attribute lost


melt(DT, id=1, measure=c("l1", "l2")) # value is a list
melt(DT, id=1, measure=c("c1", "l1")) # c1 coerced to list


     melt(DT, id=1, measure=c("c1", "f1")) # value is char
     melt(DT, id=1, measure=c("c1", "i2")) # i2 coerced to char


     melt(DT, id=1, measure=c("c1", "i2"), na.rm=TRUE) # remove NA



Loading required package: reshape2










i1i2f1c1d1d2l1
11NAca2013-09-022012-01-071,1,1
226ab2013-09-032012-01-062,2,2,2
337ca2013-09-042012-01-053,3,3,3,3
448NANANA2012-01-044
559ab2013-09-052012-01-035,5,5,5,5
6NA10ca2013-09-062012-01-02NA










i1i2f1c1d1d2l1l2
11NAca2013-09-022012-01-071,1,1a,a,a
226ab2013-09-032012-01-062,2,2,2b,b,b,b,b
337ca2013-09-042012-01-053,3,3,3,3a,a
448NANANA2012-01-044NA
559ab2013-09-052012-01-035,5,5,5,5b,b,b,b,b
6NA10ca2013-09-062012-01-02NAa,a,a










i1i2variablevalue
11NAf1c
226f1a
337f1c
448f1NA
559f1a
6NA10f1c










i1i2variablevalue
11NAf1c
226f1a
337f1c
448f1NA
559f1a
6NA10f1c
















i1f1variablevalue
11cd115950
22ad115951
33cd115952
44NAd1NA
55ad115953
6NAcd115954
71cd215346
82ad215345
93cd215344
104NAd215343
115ad215342
12NAcd215341
Warning message:
In melt.data.table(DT, id = c("i1", "f1"), measure = c("c1", "d1")): All 'measure.vars are NOT of the SAME type. By order of hierarchy, the molten data value column will be of type 'character'. Therefore all measure variables that are not of type 'character' will be coerced to. Check the DETAILS section of ?melt.data.table for more on coercion.
















i1f1variablevalue
11cc1a
22ac1b
33cc1a
44NAc1NA
55ac1b
6NAcc1a
71cd115950
82ad115951
93cd115952
104NAd1NA
115ad115953
12NAcd115954
















i1variablevalue
11l11,1,1
22l12,2,2,2
33l13,3,3,3,3
44l14
55l15,5,5,5,5
6NAl1NA
71l2a,a,a
82l2b,b,b,b,b
93l2a,a
104l2NA
115l2b,b,b,b,b
12NAl2a,a,a
Warning message:
In melt.data.table(DT, id = 1, measure = c("c1", "l1")): All 'measure.vars are NOT of the SAME type. By order of hierarchy, the molten data value column will be of type 'list'. Therefore all measure variables that are not of type 'list' will be coerced to. Check the DETAILS section of ?melt.data.table for more on coercion.
















i1variablevalue
11c1a
22c1b
33c1a
44c1NA
55c1b
6NAc1a
71l11,1,1
82l12,2,2,2
93l13,3,3,3,3
104l14
115l15,5,5,5,5
12NAl1NA
















i1variablevalue
11c1a
22c1b
33c1a
44c1NA
55c1b
6NAc1a
71f1c
82f1a
93f1c
104f1NA
115f1a
12NAf1c
Warning message:
In melt.data.table(DT, id = 1, measure = c("c1", "i2")): All 'measure.vars are NOT of the SAME type. By order of hierarchy, the molten data value column will be of type 'character'. Therefore all measure variables that are not of type 'character' will be coerced to. Check the DETAILS section of ?melt.data.table for more on coercion.
















i1variablevalue
11c1a
22c1b
33c1a
44c1NA
55c1b
6NAc1a
71i2NA
82i26
93i27
104i28
115i29
12NAi210
Warning message:
In melt.data.table(DT, id = 1, measure = c("c1", "i2"), na.rm = TRUE): All 'measure.vars are NOT of the SAME type. By order of hierarchy, the molten data value column will be of type 'character'. Therefore all measure variables that are not of type 'character' will be coerced to. Check the DETAILS section of ?melt.data.table for more on coercion.














i1variablevalue
11c1a
22c1b
33c1a
45c1b
5NAc1a
62i26
73i27
84i28
95i29
10NAi210

melt后,进行cast
reshape2的精髓是把整体数据(比如数据框)打碎(melt,或说熔化),让其回到一个一个数据点的状态,不过,这个数据点是可以根据观测的id名称和变量名称定位找到的(事实上,数据框是二维的,在数据框中,根据观测id和某变量名称,即可交叉定位到某数据点)。然后,再根据观测id和变量名称的组合,随心所欲的锻造需要的数据形态(cast),中间可能有求和、求平均值之类的数据运算。

names(ChickWeight) <- tolower(names(ChickWeight))
DT <- melt(as.data.table(ChickWeight), id=2:4)
dcast(DT, time ~ variable, fun=mean)
dcast(DT, diet ~ variable, fun=mean)
dcast(DT, diet+chick ~ time, drop=FALSE)
dcast(DT, diet+chick ~ time, drop=FALSE, fill=0)
dcast.data.table(DT, chick ~ time, fun=mean, subset=.(time < 10 & chick < 20))
















timeweight
1041.06
2249.22
3459.95918
4674.30612
5891.2449
610107.8367
712129.2449
814143.8125
916168.0851
1018190.1915
1120209.7174
1221218.6889








dietweight
11102.6455
22122.6167
33142.95
44135.2627
Warning message:
In `[<-.factor`(`*tmp*`, ri, value = "⋮"): invalid factor level, NA generatedWarning message:
In `[<-.factor`(`*tmp*`, ri, value = "⋮"): invalid factor level, NA generatedWarning message:
In `[<-.factor`(`*tmp*`, ri, value = "⋮"): invalid factor level, NA generatedWarning message:
In `[<-.factor`(`*tmp*`, ri, value = "⋮"): invalid factor level, NA generated

































































dietchick0246810121416182021
11183935NANANANANANANANANANA
211641454951575154NANANANANA
31154149566468686768NANANANA
4113414853606567717071819196
51942515968859690929310010098
6120414754586573778998107115117
71104144526374818996101112120124
818425061718493110116126134125NA
911742516172838998103113123133142
101194348556265718288106120144157
1114424956677487102108136154160157
12164149597497124141148155160160157
1311143516384112139168177182184181175
1413433955678499115138163187198202
1511425159647693106125149171199205
16112414956627288119135162185195205
17124049587284103122138162187209215
18154142486079106141164197199220223
1911441496279101128164192227248259266
20174149577189112146174218250288305
21124NANANANANANANANANANANANA
22130NANANANANANANANANANANANA
23122NANANANANANANANANANANANA
24123NANANANANANANANANANANANA
25127NANANANANANANANANANANANA
26128NANANANANANANANANANANANA
27126NANANANANANANANANANANANA
28125NANANANANANANANANANANANA
29129NANANANANANANANANANANANA
30121NANANANANANANANANANANANA
31NANA
171424NANANANANANANANANANANANA
172430NANANANANANANANANANANANA
173422NANANANANANANANANANANANA
174423NANANANANANANANANANANANA
175427NANANANANANANANANANANANA
176428NANANANANANANANANANANANA
177426NANANANANANANANANANANANA
178425NANANANANANANANANANANANA
179429NANANANANANANANANANANANA
180421NANANANANANANANANANANANA
181433NANANANANANANANANANANANA
182437NANANANANANANANANANANANA
183436NANANANANANANANANANANANA
184431NANANANANANANANANANANANA
185439NANANANANANANANANANANANA
186438NANANANANANANANANANANANA
187432NANANANANANANANANANANANA
188440NANANANANANANANANANANANA
189434NANANANANANANANANANANANA
190435NANANANANANANANANANANANA
19144442516586103118127138145146NANA
1924454150617898117135141147174197196
19344342556996131157184188197198199200
19444142516685103124155153175184199204
19544741536679100123148157168185210205
19644940536485108128152166184203233237
19744640526282101120144156173210231238
19845041546784105122155175205234264264
19944242496384103126160174204234269281
20044839506280104125154170222261303322
Warning message:
In `[<-.factor`(`*tmp*`, ri, value = "⋮"): invalid factor level, NA generatedWarning message:
In `[<-.factor`(`*tmp*`, ri, value = "⋮"): invalid factor level, NA generatedWarning message:
In `[<-.factor`(`*tmp*`, ri, value = "⋮"): invalid factor level, NA generatedWarning message:
In `[<-.factor`(`*tmp*`, ri, value = "⋮"): invalid factor level, NA generated

































































dietchick0246810121416182021
111839350000000000
21164145495157515400000
311541495664686867680000
4113414853606567717071819196
51942515968859690929310010098
6120414754586573778998107115117
71104144526374818996101112120124
8184250617184931101161261341250
911742516172838998103113123133142
101194348556265718288106120144157
1114424956677487102108136154160157
12164149597497124141148155160160157
1311143516384112139168177182184181175
1413433955678499115138163187198202
1511425159647693106125149171199205
16112414956627288119135162185195205
17124049587284103122138162187209215
18154142486079106141164197199220223
1911441496279101128164192227248259266
20174149577189112146174218250288305
21124000000000000
22130000000000000
23122000000000000
24123000000000000
25127000000000000
26128000000000000
27126000000000000
28125000000000000
29129000000000000
30121000000000000
31NANA
171424000000000000
172430000000000000
173422000000000000
174423000000000000
175427000000000000
176428000000000000
177426000000000000
178425000000000000
179429000000000000
180421000000000000
181433000000000000
182437000000000000
183436000000000000
184431000000000000
185439000000000000
186438000000000000
187432000000000000
188440000000000000
189434000000000000
190435000000000000
1914444251658610311812713814514600
1924454150617898117135141147174197196
19344342556996131157184188197198199200
19444142516685103124155153175184199204
19544741536679100123148157168185210205
19644940536485108128152166184203233237
19744640526282101120144156173210231238
19845041546784105122155175205234264264
19944242496384103126160174204234269281
20044839506280104125154170222261303322









chick02468
1183935NaNNaNNaN
2164145495157
3154149566468
4134148536065
594251596885

duplicated 根据键值判断当前行是否与之前的有重复
unique 返回去掉重复后的值
anyDuplicated 返回第一个重复值的index i 没有就为0
uniqueN 相等于length(unique(x))

DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6), key = "A,B")
DT
duplicated(DT)
unique(DT)
















ABC
1111
2112
3111
4122
5221
6222
7231
8232
9331
10342
11341
12342

  1. FALSE

  2. TRUE

  3. TRUE

  4. FALSE

  5. FALSE

  6. TRUE

  7. FALSE

  8. TRUE

  9. FALSE

  10. FALSE

  11. TRUE

  12. TRUE











ABC
1111
2122
3221
4231
5331
6342
DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6), key = "A,B")
DT
duplicated(DT, by="B", fromLast=TRUE)
unique(DT, by="B", fromLast=TRUE)
















ABC
1111
2112
3111
4122
5221
6222
7231
8232
9331
10342
11341
12342

  1. TRUE

  2. TRUE

  3. FALSE

  4. TRUE

  5. TRUE

  6. FALSE

  7. TRUE

  8. TRUE

  9. FALSE

  10. TRUE

  11. TRUE

  12. FALSE









ABC
1111
2222
3331
4342
anyDuplicated(DT, by=c("A", "B"))
any(duplicated(DT, by=c("A", "B")))

2

TRUE

x[i, j, by, keyby, with = TRUE…..]
根据语法,当数据表的i为data.table时,x必须设置key值,i和x连接
DT[J(…)]就是一种更可读的方法,通过创造一个data.table作为i传入,与DT连接
x[data.table(id)]与x[J(id)]等同 也可以直接写成x[id],结果等与subset(x, key == id)
x[list(id)]与x[.(id)]等同
DT[“b”]

DT = data.table(A=5:1,B=letters[5:1])
DT
setkey(DT,B)
DT[J("b")]
CJ(c(5,NA,1), c(1,3,2))
CJ(c(5,NA,1), c(1,3,2), sorted=FALSE)
x = c(1,1,2)
y = c(4,6,4)
CJ(x, y, unique=TRUE)









AB
15e
24d
33c
42b
51a





AB
12b













V1V2
1NA1
2NA2
3NA3
411
512
613
751
852
953













V1V2
151
253
352
4NA1
5NA3
6NA2
711
813
912













V1V2unique
114TRUE
214TRUE
316TRUE
414TRUE
514TRUE
616TRUE
724TRUE
824TRUE
926TRUE

like 通过正则表达式匹配

DT = data.table(Name=c("Mary","George","Martha"), Salary=c(2,3,4))
DT
DT[Name %like% "^Mar"]







NameSalary
1Mary2
2George3
3Martha4






NameSalary
1Mary2
2Martha4

merge 合并 data.table的merge方法和data.frame的merge方法,由一个区别就是,在默认情况下,data.table通过共同的key进行合并,而data.frame则是通过共同的列名

dt1 <- data.table(A = letters[1:10], X = 1:10, key = "A")
dt2 <- data.table(A = letters[5:14], Y = 1:10, key = "A")
merge(dt1, dt2)
merge(dt1, dt2, all = TRUE)










AXY
1e51
2f62
3g73
4h84
5i95
6j106


















AXY
1a1NA
2b2NA
3c3NA
4d4NA
5e51
6f62
7g73
8h84
9i95
10j106
11kNA7
12lNA8
13mNA9
14nNA10
dt1 <- data.table(A = letters[rep(1:3, 2)], X = 1:6, key = "A")
dt2 <- data.table(A = letters[rep(2:4, 2)], Y = 6:1, key = "A")
merge(dt1, dt2, allow.cartesian=TRUE)












AXY
1b26
2b23
3b56
4b53
5c35
6c32
7c65
8c62

na.omit.data.table 根据指定的列去掉行值为空值的行
na.omit(object, cols=seq_along(object), invert=FALSE, …)

DT = data.table(x=c(1,NaN,NA,3), y=c(NAinteger, 1:3), z=c(“a”, NAcharacter, “b”, “c”))
/ default behaviour /
na.omit(DT)
/ omit rows where ‘x’ has a missing value/
na.omit(DT, cols=”x”)
/ omit rows where either ‘x’ or ‘y’ have missing values/
na.omit(DT, cols=c(“x”, “y”))

rbindlist 对一列表的data.table进行行合并,组成一个data.table

DT1 = data.table(A=1:3,B=letters[1:3])
DT2 = data.table(A=4:5,B=letters[4:5])
l = list(DT1,DT2)
rbindlist(l)









AB
11a
22b
33c
44d
55e
DT1 = data.table(A=1:3,B=letters[1:3])
DT2 = data.table(B=letters[4:5],A=4:5)
l = list(DT1,DT2)
rbindlist(l, use.names=TRUE)









AB
11a
22b
33c
44d
55e
DT1 = data.table(A=1:3,B=letters[1:3])
DT2 = data.table(B=letters[4:5],C=factor(1:2))
l = list(DT1,DT2)
rbindlist(l, use.names=TRUE, fill=TRUE)









ABC
11aNA
22bNA
33cNA
4NAd1
5NAe2

setattr(x,name,value)
setnames(x,old,new)

DT = data.table(a=1:2,b=3:4,c=5:6)
setnames(DT,"b","B")
setnames(DT,3,"C")
DT






aBC
1135
2246

setcolorder(x, neworder) 对列名进行指定的排序

DT = data.table(A=sample(3, 10, TRUE),B=sample(letters[1:3], 10, TRUE), C=sample(10))
setcolorder(DT, c("C", "A", "B"))
DT














CAB
142a
223a
333a
492a
572b
6101c
713c
852b
983b
1062b

setorder 对行进行排序

DT = data.table(A=sample(3, 10, TRUE),B=sample(letters[1:3], 10, TRUE), C=sample(10))
setorder(DT, A, -B)
setorderv(DT, c("A", "B"), c(1, -1))
DT














ABC
11a4
22c9
32c5
42c2
52b10
62a7
73c1
83c8
93a6
103a3

setDF 通过引用的方式把数据表变成数据框
setDT 通过引用的方式把数据框变成数据表

X = data.table(x=1:5, y=6:10)
setDF(X)[]
A = list(a=1:5, a=6:10)
setDT(A)[]









xy
116
227
338
449
5510









aa
116
227
338
449
5510

tables 列出所有class为data.table的对象

tables()

     NAME NROW NCOL MB COLS  KEY
[1,] A       5    2  1 a,a      
[2,] DT     10    3  1 A,B,C    
[3,] dt1     6    2  1 A,X   A  
[4,] DT1     3    2  1 A,B      
[5,] dt2     6    2  1 A,Y   A  
[6,] DT2     2    2  1 B,C      
Total: 6MB