标签:www 数据类型 split max func sap 检测 question app
例子:
v = c(2, 9, 1, 45, -3, 19, -5, 6)
sort(v) # returns v sorted in increasing order (the default)
结果:
# [1] -5 -3 1 2 6 9 19 45
sort(v, decreasing = FALSE) # orders v in increasing order
结果:
# [1] -5 -3 1 2 6 9 19 45
order(v) # returns the indexes of v that would put it in increasing order
结果:
# [1] 7 5 3 1 8 2 6 4
order()可以和更加复杂的数据结构配合使用,从而适应更加复杂的场景
如:
#Imagine that you just want to access Sepal Length and Species.
# You can access those values in different ways:
ir[order(ir$Sepal.Length, decreasing = TRUE),c("Sepal.Length", "Species")][1:5,]
ir[order(ir$Sepal.Length, decreasing = TRUE),][1:5, c("Sepal.Length", "Species")]
aggregate(x, by, FUN, ..., simplify = TRUE)
举例:
针对单一对象,一种分组
aggregate(ir$Sepal.Length, by= list(ir$Species), FUN=mean)
aggregate(ir$Sepal.Width, by= list(ir$Species), summary)
针对多个对象,一种分组
aggregate(ir[,c("Sepal.Length", "Sepal.Width")], by=list(ir$Species), mean)
针对单一对象,多种分组
mean_by_sp_ind = aggregate(ir$Sepal.Length, by=list(ir$Species, ir$indoor), mean)
aggregate(formula, data, FUN, subset)
举例:
aggregate(Sepal.Length ~ Species, data = ir, mean)
针对单一对象,一种分组,相当于:
aggregate(ir$Sepal.Length, by= list(ir$Species), FUN=mean)
aggregate(Sepal.Length ~ Species + indoor, data = ir, mean)
针对单一对象,多种分组,相当于:
mean_by_sp_ind = aggregate(ir$Sepal.Length, by=list(ir$Species, ir$indoor), mean)
aggregate(cbind(Sepal.Length, Sepal.Width) ~ Species, ir, mean)
针对多个对象,一种分组,相当于:
aggregate(ir[,c("Sepal.Length", "Sepal.Width")], by=list(ir$Species), mean)
针对所有对象,多个分组
aggregate(. ~ Species + indoor, data = ir, mean)
使用subset筛选满足条件的子集
aggregate(cbind(Sepal.Length, Sepal.Width) ~ Species, data = ir, subset = Petal.Width>0.6, mean)
举例:
# Mean of the first n values of vec after sorting it from largest to smallest.
#   vec: numeric vector to summarise.
#   n:   count passed to head(), i.e. how many of the largest values to keep.
meanX <- function(vec, n) {
  descending <- vec[order(-vec)]
  largest <- head(descending, n)
  mean(largest)
}
aggregate(ir$Sepal.Length, by = list(ir$Species), FUN= function(x) meanX(x, 5))
举例:
aggregate(ir$Sepal.Length, by = list(ir$Species), FUN= function(x) mean(head(x[order(-x)], 5)))
举例:
aggregate(ir$Sepal.Length, by = list(ir$Species),
FUN= function(x, n=5) mean(head(x[order(-x)], n)))
aggregate(ir$Sepal.Length, n=5, by = list(ir$Species),
FUN= function(x, n) mean(head(x[order(-x)], n)))
The main dplyr functions are: filter(), arrange(), select(), mutate(), summarize(), sample_n(), sample_frac(), and group_by().
得到满足条件的行
filter(data frame, condition).
filter(iris, Species=="setosa") # using dplyr
等效于
iris[iris$Species=="setosa",] # using basic R
arrange(data frame, attributes)
将数据帧按照属性排序
升序:
arrange(iris, Sepal.Length) #dplyr
iris[order(iris$Sepal.Length, decreasing = FALSE),] # basic R
iris[order(iris$Sepal.Length),] # basic R
降序:
arrange(iris, desc(Sepal.Length))
iris[order(iris$Sepal.Length, decreasing = TRUE),]
iris[order(-iris$Sepal.Length),]
arrange(iris, Sepal.Length,Sepal.Width)[1:5,]
升序与降序结合:
arrange(iris, Sepal.Length, desc(Sepal.Width))[1:5,]
iris[order(iris$Sepal.Length, -iris$Sepal.Width),][1:5,]
select(data frame, var1, ..., varX)
选择满足条件的列,或者去掉某几列
# Keep only the Petal.Width and Species columns (dplyr).
select(ir, Petal.Width, Species)
# Base R equivalent: index the same two columns by name.
ir[, c("Petal.Width", "Species")]
select(ir, -Species)
ir[, -c(5)]
select可以使用类似通配符的相关功能
select(ir,starts_with("Petal")) #Petal.Length and Petal.Width
select(ir, ends_with("Length")) #Sepal.Length and Petal.Length
mutate(data frame, expression)
添加新的列到数据帧
ir = mutate(ir, DoubleSepalL = Sepal.Length*2,
PetalRatio = Petal.Length/Petal.Width)
ir$DoubleSepalL = ir$Sepal.Length*2
ir$PetalRatio = ir$Petal.Length/ir$Petal.Width
summarize(data frame, function(var1, ..., varX))
可以调用多个内置函数: sd(), min(), max(), median(), sum(), cor() (correlation), n() (length of vector), first() (first value), last() (last value) and n_distinct() (number of distinct values in vector).
summarise(ir, avg = mean(Sepal.Length), std= sd(Sepal.Length), total=n())
其中n()求行数
summary(ir)
summarise_all可以处理多个对象
均值
summarise_all(ir[,1:4],mean)
四分位点
summarise_all(ir[,1:4],quantile, probs=0.75)
乱序、取某几行的样本
sample_n(iris,5)
iris[sample(1:nrow(iris)),][1:5,]
乱序、按比例取样本
sample_frac(iris,0.01) # 取1%的样本
等效于:
iris[sample(1:nrow(iris)),][1:ceiling(nrow(iris)*0.01),]
group_by(data frame, variable)
一般与其余函数结合使用,如: summarize
summarize(group_by(ir, Species), sd(Petal.Width))
等效于:
aggregate(ir$Petal.Width, by=list(ir$Species), FUN=sd) #Base R
求相关系数
summarize(group_by(ir, Species), r=cor(Sepal.Length, Sepal.Width))
将一个函数的输出作为另一个函数的输入
group_by(ir, Species) %>% summarise(avg= mean(Petal.Length))
等效于:
# Nested-call form of the piped expression; the summary column is named avg
# so that both forms produce the same output.
summarise(group_by(ir, Species), avg = mean(Petal.Length))
piping 可以将多个函数直接串联起来,使用起来更加高效。下面用三种写法完成同一个统计:
#A. Using base R
non_virg = ir[ir$Species!="virginica", c("Petal.Length")]
sum(non_virg>3.5)
## [1] 45
#B. Using dplyr with no piping
summarise(filter(ir,Species!="virginica",Petal.Length>3.5), n())
## n()
## 1 45
#C. Using dplyr with piping
ir %>% filter(Species!="virginica", Petal.Length>3.5) %>% nrow()
## [1] 45
注意:piping 只能将函数连接起来,不适用于 base R 的方括号下标等写法
ir %>%
mutate(petal_w_l = Petal.Width/Petal.Length) %>%
arrange(desc(petal_w_l)) %>%
head(3) %>% select(Species, petal_w_l)
注意:which.max(by_spc$Sepal.Length_mean) 中的下划线 '_' 只是 by_spc 中列名的一部分(列名的形式为“原列名_函数名”),并没有特殊的含义
summ = c(min = min,max = max,mean = mean,median = median, q2={function(x) quantile(x, 0.25)},q3={function(x) quantile(x, 0.75)})
by_spc=group_by(ir, Species) %>% summarise_all(summ)
by_spc
# A tibble: 3 x 25
Species Sepal.Length_min Sepal.Width_min Petal.Length_min
<fct> <dbl> <dbl> <dbl>
1 setosa 4.3 2.3 1
2 versic~ 4.9 2 3
3 virgin~ 4.9 2.2 4.5
......
a. Which plants have the highest mean sepal length?
by_spc[which.max(by_spc$Sepal.Length_mean),]$Species
b. Which plants have the sample with the smallest petal width?
by_spc[which.min(by_spc$Petal.Width_min),]$Species
使用sapply、class
sapply(choco, class)
Obtain the mode from all of the nominal attributes in the dataset.
注意 factor 使用
举例:
choco_nominal = choco[sapply(choco,{function (x) is.factor(x)})==TRUE]
sapply(choco_nominal, {function(x) names(which.max(table(x)))})
How many distinct companies have been considered? length(unique(choco$Company))
length(unique(choco$Company))
如:
Circulation2004= as.numeric(gsub(pattern = ",", replacement="", x=books$Daily.Circulation..2004))
标签:www 数据类型 split max func sap 检测 question app
原文地址:https://www.cnblogs.com/Stephanie-boke/p/12541868.html