原始数据(raw dataset) -> 预处理后的数据(clean dataset)
基本方法:
- []:提取一个或多个类型相同的元素
- [[]]:从列表或者数据框中提取元素
- $:按名字从列表或数据框中提取元素
(1)向量、矩阵、数据框的子集
# 向量的子集
> x <- 1:10
> x[1]
[1] 1
> x[5]
[1] 5
> x[1:5]
[1] 1 2 3 4 5
> x[x>5]
[1]  6  7  8  9 10
> x>5
 [1] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
> x[x>5 & x<7]
[1] 6
> x[x<3 | x>7]
[1]  1  2  8  9 10
> y <- 1:4
> y
[1] 1 2 3 4
> names(y) <- c("a","b","c","d")
> y
a b c d 
1 2 3 4 
> y[2]
b 
2 
> y["b"]
b 
2 
> 
# 矩阵的子集
> x <- matrix(1:6, nrow = 2, ncol = 3)
> x
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6
> x[1,2] # 拿一个元素
[1] 3
> x[1,] # 拿一行元素
[1] 1 3 5
> x[,1] # 拿一列元素
[1] 1 2
> x[2,c(1,3)]
[1] 2 6
> class(x[1,2])
[1] "integer"
> x[1,2, drop = FALSE] # 让取出的元素仍保持矩阵类型
     [,1]
[1,]    3
# 数据框的子集
> x <- data.frame(v1=1:5, v2=6:10, v3=11:15)
> x
  v1 v2 v3
1  1  6 11
2  2  7 12
3  3  8 13
4  4  9 14
5  5 10 15
> x$v3[c(2,4)] <- NA
> x
  v1 v2 v3
1  1  6 11
2  2  7 NA
3  3  8 13
4  4  9 NA
5  5 10 15
> x[,2]
[1]  6  7  8  9 10
> x[,"v2"]
[1]  6  7  8  9 10
> x[(x$v1<4 & x$v2>=8),]
  v1 v2 v3
3  3  8 13
> x[(x$v1<4 | x$v2>=8),]
  v1 v2 v3
1  1  6 11
2  2  7 NA
3  3  8 13
4  4  9 NA
5  5 10 15
> x[x$v1>2,]
  v1 v2 v3
3  3  8 13
4  4  9 NA
5  5 10 15
> x[which(x$v1>2),]
  v1 v2 v3
3  3  8 13
4  4  9 NA
5  5 10 15
> which(x$v1>2)
[1] 3 4 5
> x$v1>2
[1] FALSE FALSE  TRUE  TRUE  TRUE
> subset(x,x$v1>2)
  v1 v2 v3
3  3  8 13
4  4  9 NA
5  5 10 15

(2)列表的子集
[[]] /$ / [[]][] / [[]][[]]嵌套列表/不完全匹配(partial matching)> x <- list(id = 1:4, height = 170, gender = "male")//创建一个列表> x$id[1] 1 2 3 4$height[1] 170$gender[1] "male"> x[1]//取列表中的第一个元素$id[1] 1 2 3 4> x["id"]$id[1] 1 2 3 4//只取第一个元素的内容> x[[1]][1] 1 2 3 4> x[["id"]][1] 1 2 3 4> x$id[1] 1 2 3 4> > x[c(1,3)]//取列表中的第一个和第三个元素$id[1] 1 2 3 4$gender[1] "male"> > y <- "id"> x[["id"]][1] 1 2 3 4> x[[y]]//通过y来指代[1] 1 2 3 4> > > x$id[1] 1 2 3 4> x$y//这种方法不适用NULL> > x <- list(a=list(1,2,3,4), b=c("Monday","Tuesday"))//嵌套列表> x$a$a[[1]][1] 1$a[[2]][1] 2$a[[3]][1] 3$a[[4]][1] 4$b[1] "Monday" "Tuesday"> x[[1]][[1]][1] 1[[2]][1] 2[[3]][1] 3[[4]][1] 4> x[[1]][[2]]//第一个里面的第二个元素内容[1] 2> x[[1]][2]//第一个元素的第二个元素[[1]][1] 2> > > x[[c(1,3)]]//第一个里面的第三个[1] 3> x[[c(2,2)]][1] "Tuesday"> > > //不完全匹配> l <- list(sddfg = 1:10)> l$sddfg [1] 1 2 3 4 5 6 7 8 9 10> l$sddfg [1] 1 2 3 4 5 6 7 8 9 10> l$aNULL> l$s [1] 1 2 3 4 5 6 7 8 9 10> l[["s", exact = FALSE]] [1] 1 2 3 4 5 6 7 8 9 10(3)如何处理缺失值(missing value)
> x <- c(1, NA, 2, NA, 3)> is.na(x)[1] FALSE TRUE FALSE TRUE FALSE> x[!is.na(x)][1] 1 2 3> x <- c(1, NA, 2, NA, 3)> y <- c("a","b",NA,"c",NA)> z <- complete.cases(x,y)//x,y都不是缺失值得元素> z[1] TRUE FALSE FALSE FALSE FALSE> > x[z]//分别查看x和y中不是缺失值得值[1] 1> y[z][1] "a"> library(datasets)//加载一个数据集> head(airquality)//查看数据集的前6行 Ozone Solar.R Wind Temp Month Day1 41 190 7.4 67 5 12 36 118 8.0 72 5 23 12 149 12.6 74 5 34 18 313 11.5 62 5 45 NA NA 14.3 56 5 56 28 NA 14.9 66 5 6> g <- complete.cases(airquality)//查看缺失值> g [1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [25] FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE [37] FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE [49] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [61] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE [73] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE [97] FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE[109] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE[133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE[145] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE> airquality[g,][1:10,]//选择数据集中不存在缺失值的行,列全要;查看1到10行,列全要。 Ozone Solar.R Wind Temp Month Day1 41 190 7.4 67 5 12 36 118 8.0 72 5 23 12 149 12.6 74 5 34 18 313 11.5 62 5 47 23 299 8.6 65 5 78 19 99 13.8 59 5 89 8 19 20.1 61 5 912 16 256 9.7 69 5 1213 11 290 9.2 66 5 1314 14 274 10.9 68 5 14(4)向量化操作
——可以作用于向量、矩阵等结构,使得代码简洁,易于阅读、效率高
向量和矩阵的加减乘除(+、-、*、/)都是逐元素进行的,即对位置相对应的元素分别做运算;如果要计算真正的矩阵乘法,应使用运算符 %*%。例如:对两个 2×2 矩阵 x、y,x * y 是对应元素相乘,x %*% y 才是矩阵乘积。
(1)循环 - R不仅有for/while循环语句,还有更强大的实现循环的“一句话”函数:
lapply:循环处理列表中的每一个元素
- 用法:lapply(列表, 函数/函数名, 其他参数)
- 总是返回一个列表(输入不是列表时,例如向量 1:4,会先被转换成列表再逐个处理)
sapply:简化结果
(1)结果列表元素长度均为1,返回向量(2)结果列表元素长度相同且大于1,返回矩阵x <- list(a=1:10, b=c(11,21,31,41,51))xlapply(x,mean)x <- 1:4lapply(x, runif)lapply(x, runif, min=0, max=100)x <- list(a=matrix(1:6,2,3), b=matrix(4:7,2,2))lapply(x, function(m) m[1,])#sapplyx <- list(a=1:10, b=c(11,21,31,41,51))xlapply(x,mean)sapply(x,mean)//执行步骤:> str(lapply)//查看一个函数的标准化function (X, FUN, ...) > x <- list(a=1:10, b=c(11,21,31,41,51))> x$a [1] 1 2 3 4 5 6 7 8 9 10$b[1] 11 21 31 41 51> lapply(x,mean)$a[1] 5.5$b[1] 31> x <- 1:4> lapply(x, runif)//runif取随机数[[1]][1] 0.8024411[[2]][1] 0.3922546 0.6929949[[3]][1] 0.64910476 0.06124001 0.45324513[[4]][1] 0.01928596 0.86259091 0.67297106 0.98231294> lapply(x, runif, min=0, max=100)[[1]][1] 34.09794[[2]][1] 20.99846 45.18515[[3]][1] 19.148935 81.885369 5.879639[[4]][1] 18.60201 53.44052 27.06450 15.64718> x <- list(a=matrix(1:6,2,3), b=matrix(4:7,2,2))> lapply(x, function(m) m[1,])//自定义函数,求矩阵的第一行$a[1] 1 3 5$b[1] 4 6> #sapply//简化结果> x <- list(a=1:10, b=c(11,21,31,41,51))> x$a [1] 1 2 3 4 5 6 7 8 9 10$b[1] 11 21 31 41 51> lapply(x,mean)$a[1] 5.5$b[1] 31> sapply(x,mean) a b 5.5 31.0 > class(sapply(x,mean))[1] "numeric"apply:沿着数组的某一维度处理数据(1)例如:将函数用于矩阵的行或者列(2)虽然与for/while循环的效率相似,但是只用一句话就可以完成apply(参数):apply(数组,维度,函数/函数名)x <- matrix(1:16,4,4)> x [,1] [,2] [,3] [,4][1,] 1 5 9 13[2,] 2 6 10 14[3,] 3 7 11 15[4,] 4 8 12 16> apply(x,2,mean)//2代表列[1] 2.5 6.5 10.5 14.5> apply(x,2,sum)[1] 10 26 42 58> sumfunction (..., na.rm = FALSE) .PRimitive("sum")> > > apply(x,1,mean)//1代表行[1] 7 8 9 10> apply(x,1,sum)[1] 28 32 36 40> rowSums(x)rowMeans(x)colSums(x)colMeans(x)//更简便的算行列和以及平均数的函数> x <- matrix(rnorm(100),10,10)> apply(x, 1, quantile, probs=c(0.25,0.75))//算分位数 [,1] [,2] [,3] [,4] [,5] [,6]25% 0.06054315 -0.8796558 -0.6019438 -0.3698089 -0.5951642 -0.8018764675% 0.74679590 0.3931769 0.2459020 0.6931527 0.3820894 0.08229792 [,7] [,8] [,9] [,10]25% -0.5985797 -0.1538297 -0.8994844 -1.20206575% 0.7508944 0.8694427 0.8951599 -0.278875> x [,1] [,2] [,3] [,4] [,5] [,6] [1,] 0.7943604 
0.03776428 -1.30135026 0.1523084 0.12887977 1.8300554 [2,] 0.5349808 -0.77551826 -0.03223475 2.8928050 -0.38175344 -0.3784332 [3,] -2.5381823 1.06850374 -0.26883696 0.3875790 -0.65437178 0.8733678 [4,] 0.7108435 0.64008029 -2.75786167 -0.2609761 0.74268723 -0.6941522 [5,] 0.3862663 -0.65398901 -1.00904319 -0.3064571 -0.41868968 -1.3247414 [6,] 0.3072321 -0.67854760 -1.52100130 -0.8338515 -0.04192732 0.1019094 [7,] -0.7110167 0.99272860 0.49472202 0.3522931 -0.89273593 -0.1111968 [8,] 0.8180394 0.24945472 0.79362529 -0.3022865 0.88657718 -0.2078542 [9,] -0.5296272 0.10399219 -1.13527860 1.5599123 -1.32145044 2.1397333[10,] -0.9080016 0.90191879 -0.11167784 -0.6201164 -1.26570093 -2.0554333 [,7] [,8] [,9] [,10] [1,] 2.1890150 0.2225240 -1.097342438 0.604102489 [2,] -1.0996727 -1.4860049 -0.914368314 0.662635948 [3,] -0.1958515 -1.6066377 -0.179128701 -0.444659977 [4,] 0.5083954 0.8595397 -0.008981572 -0.406086520 [5,] -0.0432980 1.9158199 0.369558390 1.902044999 [6,] -0.7059515 -1.2839743 1.956473298 0.023463385 [7,] -0.2612688 1.1744811 0.836285136 -0.985409256 [8,] 1.1911508 3.4116767 -0.689197813 0.008243826 [9,] 1.0691268 -0.8876152 -0.903440765 0.373259033[10,] -0.6666487 -1.0111572 -2.403407215 -0.165127814> x <- array(rnorm(2*3*4), c(2,3,4))//处理多维数据> apply(x,c(1,2),mean) [,1] [,2] [,3][1,] -0.3919501 -0.6118854 -0.079962927[2,] -0.3715698 -0.0623516 0.007735163> apply(x,c(1,3),mean) [,1] [,2] [,3] [,4][1,] -0.8401368 0.2964799 -0.8814463 -0.01996122[2,] 0.3391606 -0.7196667 0.2350666 -0.42280886mapply:lapply的多元版本mapply(参数):mapply(函数/函数名,数据,函数相关的参数)> list(rep(1,4),rep(2,3),rep(3,2),rep(4,1))[[1]][1] 1 1 1 1[[2]][1] 2 2 2[[3]][1] 3 3[[4]][1] 4> mapply(rep, 1:4, 4:1)[[1]][1] 1 1 1 1[[2]][1] 2 2 2[[3]][1] 3 3[[4]][1] 4s <- function(n, mean, std){ rnorm(n, mean, std)}#从均值为mean标准差为std的数据中抽取n个数据> s <- function(n, mean, std){+ rnorm(n, mean, std)+ }> s(4,0,1)[1] 0.4510066 1.0603416 -1.7950502 -0.1977936> mapply(s, 1:5, 5:1, 2)[[1]][1] 3.891429[[2]][1] 3.411856 
4.860403
[[3]]
[1] 4.648763 1.810501 1.254752
[[4]]
[1] 4.330916 4.414443 5.379381 1.924920
[[5]]
[1]  0.3902404  0.5769118  2.1168463 -0.2346868  1.2657041

tapply:
(1)对向量的子集进行操作
(2)tapply(参数):tapply(向量,因子/因子列表,函数/函数名)
# 5个标准正态分布(均值为0、标准差为1)的随机数,5个[0,1]均匀分布的随机数,5个均值为1、标准差为1的正态分布随机数
# (rnorm(5,1) 的第二个参数是均值;标准差未指定时取默认值1)
x <- c(rnorm(5), runif(5), rnorm(5,1))
# 因子,3个水平,每个水平下有五个元素
f <- gl(3,5)
tapply(x,f,mean)
tapply(x,f,mean, simplify=FALSE)
# 实现步骤
> x <- c(rnorm(5), runif(5), rnorm(5,1))
> x
 [1]  0.62053059 -0.56823581 -0.57007814 -1.15549512  1.30405929  0.58582563
 [7]  0.32466797  0.61626307  0.11666598  0.29560003  1.80040297  1.15604482
[13]  0.48488965  0.78446984  0.06004225
> f <- gl(3,5)
> f
 [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Levels: 1 2 3
> tapply(x,f,mean)
          1           2           3 
-0.07384384  0.38780454  0.85716990 
> tapply(x,f,mean, simplify=FALSE)
$`1`
[1] -0.07384384
$`2`
[1] 0.3878045
$`3`
[1] 0.8571699

split:
(1)根据因子或者因子列表将向量或其他对象分组
(2)通常与lapply一起使用
(3)split(参数):split(向量/列表/数据框,因子/因子列表)
x <- c(rnorm(5), runif(5), rnorm(5,1))
x
f <- gl(3,5)
split(x,f)
lapply(split(x,f),mean)
head(airquality)
# 按月份查看:
s <- split(airquality, airquality$Month)
# 查看有几个月,每个月包含几个记录
table(airquality$Month)
# 用lapply计算每一个月臭氧(Ozone)、风速(Wind)、温度(Temp)的平均值
lapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
sapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")]))
sapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")],na.rm = TRUE))
# 执行步骤:
> x <- c(rnorm(5), runif(5), rnorm(5,1))
> x
 [1]  1.23662119  0.67641029 -0.70324640 -0.09581818 -0.48951465  0.07295452
 [7]  0.53722799  0.94607521  0.68805128  0.97326122 -0.45511553  0.99704495
[13]  1.18909835  0.76038237  2.39615395
> f <- gl(3,5)
> split(x,f)
$`1`
[1]  1.23662119  0.67641029 -0.70324640 -0.09581818 -0.48951465
$`2`
[1] 0.07295452 0.53722799 0.94607521 0.68805128 0.97326122
$`3`
[1] -0.4551155  0.9970449  1.1890984  0.7603824  2.3961540
> lapply(split(x,f),mean)
$`1`
[1] 0.1248905
$`2`
[1] 0.643514
$`3`
[1] 0.9775128
> head(airquality)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
> s <- split(airquality, 
airquality$Month)> s$`5` Ozone Solar.R Wind Temp Month Day1 41 190 7.4 67 5 12 36 118 8.0 72 5 23 12 149 12.6 74 5 34 18 313 11.5 62 5 45 NA NA 14.3 56 5 56 28 NA 14.9 66 5 67 23 299 8.6 65 5 78 19 99 13.8 59 5 89 8 19 20.1 61 5 910 NA 194 8.6 69 5 1011 7 NA 6.9 74 5 1112 16 256 9.7 69 5 1213 11 290 9.2 66 5 1314 14 274 10.9 68 5 1415 18 65 13.2 58 5 1516 14 334 11.5 64 5 1617 34 307 12.0 66 5 1718 6 78 18.4 57 5 1819 30 322 11.5 68 5 1920 11 44 9.7 62 5 2021 1 8 9.7 59 5 2122 11 320 16.6 73 5 2223 4 25 9.7 61 5 2324 32 92 12.0 61 5 2425 NA 66 16.6 57 5 2526 NA 266 14.9 58 5 2627 NA NA 8.0 57 5 2728 23 13 12.0 67 5 2829 45 252 14.9 81 5 2930 115 223 5.7 79 5 3031 37 279 7.4 76 5 31$`6` Ozone Solar.R Wind Temp Month Day32 NA 286 8.6 78 6 133 NA 287 9.7 74 6 234 NA 242 16.1 67 6 335 NA 186 9.2 84 6 436 NA 220 8.6 85 6 537 NA 264 14.3 79 6 638 29 127 9.7 82 6 739 NA 273 6.9 87 6 840 71 291 13.8 90 6 941 39 323 11.5 87 6 1042 NA 259 10.9 93 6 1143 NA 250 9.2 92 6 1244 23 148 8.0 82 6 1345 NA 332 13.8 80 6 1446 NA 322 11.5 79 6 1547 21 191 14.9 77 6 1648 37 284 20.7 72 6 1749 20 37 9.2 65 6 1850 12 120 11.5 73 6 1951 13 137 10.3 76 6 2052 NA 150 6.3 77 6 2153 NA 59 1.7 76 6 2254 NA 91 4.6 76 6 2355 NA 250 6.3 76 6 2456 NA 135 8.0 75 6 2557 NA 127 8.0 78 6 2658 NA 47 10.3 73 6 2759 NA 98 11.5 80 6 2860 NA 31 14.9 77 6 2961 NA 138 8.0 83 6 30$`7` Ozone Solar.R Wind Temp Month Day62 135 269 4.1 84 7 163 49 248 9.2 85 7 264 32 236 9.2 81 7 365 NA 101 10.9 84 7 466 64 175 4.6 83 7 567 40 314 10.9 83 7 668 77 276 5.1 88 7 769 97 267 6.3 92 7 870 97 272 5.7 92 7 971 85 175 7.4 89 7 1072 NA 139 8.6 82 7 1173 10 264 14.3 73 7 1274 27 175 14.9 81 7 1375 NA 291 14.9 91 7 1476 7 48 14.3 80 7 1577 48 260 6.9 81 7 1678 35 274 10.3 82 7 1779 61 285 6.3 84 7 1880 79 187 5.1 87 7 1981 63 220 11.5 85 7 2082 16 7 6.9 74 7 2183 NA 258 9.7 81 7 2284 NA 295 11.5 82 7 2385 80 294 8.6 86 7 2486 108 223 8.0 85 7 2587 20 81 8.6 82 7 2688 52 82 12.0 86 7 2789 82 213 7.4 88 7 2890 50 275 7.4 86 7 
2991 64 253 7.4 83 7 3092 59 254 9.2 81 7 31$`8` Ozone Solar.R Wind Temp Month Day93 39 83 6.9 81 8 194 9 24 13.8 81 8 295 16 77 7.4 82 8 396 78 NA 6.9 86 8 497 35 NA 7.4 85 8 598 66 NA 4.6 87 8 699 122 255 4.0 89 8 7100 89 229 10.3 90 8 8101 110 207 8.0 90 8 9102 NA 222 8.6 92 8 10103 NA 137 11.5 86 8 11104 44 192 11.5 86 8 12105 28 273 11.5 82 8 13106 65 157 9.7 80 8 14107 NA 64 11.5 79 8 15108 22 71 10.3 77 8 16109 59 51 6.3 79 8 17110 23 115 7.4 76 8 18111 31 244 10.9 78 8 19112 44 190 10.3 78 8 20113 21 259 15.5 77 8 21114 9 36 14.3 72 8 22115 NA 255 12.6 75 8 23116 45 212 9.7 79 8 24117 168 238 3.4 81 8 25118 73 215 8.0 86 8 26119 NA 153 5.7 88 8 27120 76 203 9.7 97 8 28121 118 225 2.3 94 8 29122 84 237 6.3 96 8 30123 85 188 6.3 94 8 31$`9` Ozone Solar.R Wind Temp Month Day124 96 167 6.9 91 9 1125 78 197 5.1 92 9 2126 73 183 2.8 93 9 3127 91 189 4.6 93 9 4128 47 95 7.4 87 9 5129 32 92 15.5 84 9 6130 20 252 10.9 80 9 7131 23 220 10.3 78 9 8132 21 230 10.9 75 9 9133 24 259 9.7 73 9 10134 44 236 14.9 81 9 11135 21 259 15.5 76 9 12136 28 238 6.3 77 9 13137 9 24 10.9 71 9 14138 13 112 11.5 71 9 15139 46 237 6.9 78 9 16140 18 224 13.8 67 9 17141 13 27 10.3 76 9 18142 24 238 10.3 68 9 19143 16 201 8.0 82 9 20144 13 238 12.6 64 9 21145 23 14 9.2 71 9 22146 36 139 10.3 81 9 23147 7 49 10.3 69 9 24148 14 20 16.6 63 9 25149 30 193 6.9 70 9 26150 NA 145 13.2 77 9 27151 14 191 14.3 75 9 28152 18 131 8.0 76 9 29153 20 223 11.5 68 9 30> table(airquality$Month) 5 6 7 8 9 31 30 31 31 30 > lapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")]))$`5` Ozone Wind Temp NA 11.62258 65.54839 $`6` Ozone Wind Temp NA 10.26667 79.10000 $`7` Ozone Wind Temp NA 8.941935 83.903226 $`8` Ozone Wind Temp NA 8.793548 83.967742 $`9`Ozone Wind Temp NA 10.18 76.90 > sapply(s, function(x) colMeans(x[,c("Ozone","Wind","Temp")])) 5 6 7 8 9Ozone NA NA NA NA NAWind 11.62258 10.26667 8.941935 8.793548 10.18Temp 65.54839 79.10000 83.903226 83.967742 76.90> sapply(s, function(x) 
colMeans(x[,c("Ozone","Wind","Temp")],na.rm = TRUE)) # 拿掉缺失值(na.rm = TRUE 表示计算均值前先去掉 NA)
             5        6         7         8        9
Ozone 23.61538 29.44444 59.115385 59.961538 31.44828
Wind  11.62258 10.26667  8.941935  8.793548 10.18000
Temp  65.54839 79.10000 83.903226 83.967742 76.90000

(2)排序
- sort:对向量进行排序,返回排好序的内容
- order:返回排好序的内容的下标,可以同时给出多个排序标准
x <- data.frame(v1=1:5, v2=c(10,7,9,6,8), v3=11:15, v4=c(1,1,2,2,1))x#对数据框中v2进行排序sort(x$v2)sort(x$v2, decreasing = TRUE)order(x$v2)x[order(x$v2),]#先对v4进行排序,如果遇到两个一样的元素,再按v2进行排序x[order(x$v4,x$v2),]x[order(x$v4,x$v2,decreasing = TRUE),]//执行步骤:> x <- data.frame(v1=1:5, v2=c(10,7,9,6,8), v3=11:15, v4=c(1,1,2,2,1))> x v1 v2 v3 v41 1 10 11 12 2 7 12 13 3 9 13 24 4 6 14 25 5 8 15 1> sort(x$v2)[1] 6 7 8 9 10> sort(x$v2, decreasing = TRUE)[1] 10 9 8 7 6> order(x$v2)[1] 4 2 5 3 1> x[order(x$v2),] v1 v2 v3 v44 4 6 14 22 2 7 12 15 5 8 15 13 3 9 13 21 1 10 11 1> x[order(x$v4,x$v2),] v1 v2 v3 v42 2 7 12 15 5 8 15 11 1 10 11 14 4 6 14 23 3 9 13 2> x[order(x$v4,x$v2,decreasing = TRUE),] v1 v2 v3 v43 3 9 13 24 4 6 14 21 1 10 11 15 5 8 15 12 2 7 12 1(3)总结数据信息
#查看前六行head(airquality)#看后六行tail(airquality)#自定义行数head(airquality,10)summary(airquality)str(airquality)table(airquality$Month)table(airquality$Ozone, useNA = "ifany")table(airquality$Month,airquality$Day)#判断是否有缺失值any(is.na(airquality$Ozone))sum(is.na(airquality$Ozone))#判断月份是不是都小于12all(airquality$Month<12)#新的一个数据titanic <- as.data.frame(Titanic)head(titanic)dim(titanic)summary(titanic)#一个新的表x <- xtabs(Freq ~ Class + Age, data=titanic)ftable(x)#了解我们的数据有多大object.size(airquality)print(object.size(airquality), units = "Kb")//执行步骤> #查看前六行> head(airquality) Ozone Solar.R Wind Temp Month Day1 41 190 7.4 67 5 12 36 118 8.0 72 5 23 12 149 12.6 74 5 34 18 313 11.5 62 5 45 NA NA 14.3 56 5 56 28 NA 14.9 66 5 6> #看后六行> tail(airquality) Ozone Solar.R Wind Temp Month Day148 14 20 16.6 63 9 25149 30 193 6.9 70 9 26150 NA 145 13.2 77 9 27151 14 191 14.3 75 9 28152 18 131 8.0 76 9 29153 20 223 11.5 68 9 30> head(airquality,10) Ozone Solar.R Wind Temp Month Day1 41 190 7.4 67 5 12 36 118 8.0 72 5 23 12 149 12.6 74 5 34 18 313 11.5 62 5 45 NA NA 14.3 56 5 56 28 NA 14.9 66 5 67 23 299 8.6 65 5 78 19 99 13.8 59 5 89 8 19 20.1 61 5 910 NA 194 8.6 69 5 10> summary(airquality) Ozone Solar.R Wind Temp Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00 Median : 31.50 Median :205.0 Median : 9.700 Median :79.00 Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00 Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00 NA's :37 NA's :7 Month Day Min. :5.000 Min. : 1.0 1st Qu.:6.000 1st Qu.: 8.0 Median :7.000 Median :16.0 Mean :6.993 Mean :15.8 3rd Qu.:8.000 3rd Qu.:23.0 Max. :9.000 Max. :31.0 > str(airquality)'data.frame': 153 obs. of 6 variables: $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ... $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ... $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ... $ Temp : int 67 72 74 62 56 66 65 59 61 69 ... $ Month : int 5 5 5 5 5 5 5 5 5 5 ... 
$ Day : int 1 2 3 4 5 6 7 8 9 10 ...> table(airquality$Month) 5 6 7 8 9 31 30 31 31 30 > table(airquality$Ozone) 1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21 22 23 24 27 1 1 1 3 1 3 1 3 2 4 4 4 4 1 4 4 1 6 2 1 28 29 30 31 32 34 35 36 37 39 40 41 44 45 46 47 48 49 50 52 3 1 2 1 3 1 2 2 2 2 1 1 3 2 1 1 1 1 1 1 59 61 63 64 65 66 71 73 76 77 78 79 80 82 84 85 89 91 96 97 2 1 1 2 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 108 110 115 118 122 135 168 1 1 1 1 1 1 1 > table(airquality$Ozone, useNA = "ifany") 1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21 1 1 1 3 1 3 1 3 2 4 4 4 4 1 4 4 22 23 24 27 28 29 30 31 32 34 35 36 37 39 40 41 1 6 2 1 3 1 2 1 3 1 2 2 2 2 1 1 44 45 46 47 48 49 50 52 59 61 63 64 65 66 71 73 3 2 1 1 1 1 1 1 2 1 1 2 1 1 1 2 76 77 78 79 80 82 84 85 89 91 96 97 108 110 115 118 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 122 135 168 <NA> 1 1 1 37 > table(airquality$Month,airquality$Day) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 29 30 31 5 1 1 1 6 1 1 0 7 1 1 1 8 1 1 1 9 1 1 0> any(is,na(airquality$Ozone))Error: could not find function "na"> any(is.na(airquality$Ozone))[1] TRUE> sum(is.na(airquality$Ozone))[1] 37> all(airquality$Month<12)[1] TRUE> #新的一个数据> titanic <- as.data.frame(Titanic)> head(titanic) Class Sex Age Survived Freq1 1st Male Child No 02 2nd Male Child No 03 3rd Male Child No 354 Crew Male Child No 05 1st Female Child No 06 2nd Female Child No 0> dim(titanic)[1] 32 5> summary(titanic) Class Sex Age Survived Freq 1st :8 Male :16 Child:16 No :16 Min. : 0.00 2nd :8 Female:16 Adult:16 Yes:16 1st Qu.: 0.75 3rd :8 Median : 13.50 Crew:8 Mean : 68.78 3rd Qu.: 77.00 Max. 
:670.00 > > > #一个新的表> xtabs(Freq ~ Class + Age, data=titanic) AgeClass Child Adult 1st 6 319 2nd 24 261 3rd 79 627 Crew 0 885> x <- xtabs(Freq ~ Class + Age, data=titanic)> ftable(x) Age Child AdultClass 1st 6 3192nd 24 2613rd 79 627Crew 0 885> object.size(airquality)5496 bytes> print(object.size(airquality), units = "Kb")5.4 Kb新闻热点
疑难解答