# 轮廓图
# 另一种非常有用的图表类型便是“轮廓图”,它通过绘制出每个变量在样本中的值,展示出每个变量的变化。
# 下文的“makeProfilePlot()”函数可以绘制出轮廓图。这个函数需要“RColorBrewer”库。
makeProfilePlot<-function(mylist,names){
require(RColorBrewer)
# find out how many variables we want to include
numvariables<-length(mylist)
# choose ‘numvariables‘ random colours
colours<-brewer.pal(numvariables,"Set1")
# find out the minimum and maximum values of the variables:
mymin<-1e+20
mymax<-1e-20
for(i in 1:numvariables){
vectori<-mylist[[i]]
mini<-min(vectori)
maxi<-max(vectori)
if(mini<mymin) {mymin<-mini}
if(maxi>mymax) {mymax<-maxi}
}
# plot the variables
for(i in 1:numvariables){
vectori<-mylist[[i]]
namei<-names[i]
colouri<-colours[i]
# 每组的均值与方差
# 通常感兴趣于从一个特定样本群体去计算其均值和标准偏差,例如,计算每一个品种葡萄酒样本。葡萄酒品种被存储在“wine”变量的“V1”列中。
# 为了仅提取2号品种的数据,我们输入:
# Keep only the rows of `wine` whose cultivar label (column V1) equals 2.
cultivar2wine <- wine[wine$V1 == 2, ]
# Per-column mean of columns 2 to 14 for that cultivar.
vapply(cultivar2wine[2:14], mean, numeric(1))
# Per-column standard deviation of the same columns.
vapply(cultivar2wine[2:14], sd, numeric(1))
# 你也可以通过相似的方法计算1号品种样本,或者是3号品种样本的13种化学物质浓度的均值和标准偏差:
# 然而,为了方便起见,你也许想通过以下的“printMeanAndSdByGroup()”函数一次性输出数据集中分组数据的均值和标准偏差:
# Print, for every group, the mean, standard deviation and sample count of
# each variable.
#
# Args:
#   variables:     data frame (or matrix) holding the variables to summarise.
#   groupvariable: one-column data frame (or matrix) holding the group label
#                  of each row; only its first column is used.
#
# Three tables are printed in turn, each preceded by its heading ("Mean:",
# "Standard deviations:", "Sample sizes:"). The last table (sample sizes) is
# returned invisibly.
printMeanAndSdByGroup <- function(variables, groupvariable) {
  # Column labels shared by all three tables: group column first, then the
  # variables.
  table_names <- c(names(groupvariable), names(as.data.frame(variables)))
  # Take the first column so the grouping is a plain vector, not a list.
  groupvariable <- groupvariable[, 1]

  # Summary function to apply per group, keyed by the heading printed above
  # the resulting table.
  summaries <- list(
    "Mean:" = mean,
    "Standard deviations:" = sd,
    "Sample sizes:" = length
  )

  group_table <- NULL
  for (heading in names(summaries)) {
    # Compute the table before printing its heading, so a failure in
    # aggregate() leaves no dangling heading behind.
    group_table <- aggregate(as.matrix(variables) ~ groupvariable,
                             FUN = summaries[[heading]])
    names(group_table) <- table_names
    print(heading)
    print(group_table)
  }
  invisible(group_table)
}
# Print the per-group mean, standard deviation and sample size of columns 2-14 of `wine`, grouped by column 1.
printMeanAndSdByGroup(wine[2:14],wine[1])
# printMeanAndSdByGroup() also prints the number of samples in each group. In this example we can see that cultivar 1 has 59 samples, cultivar 2 has 71 samples and cultivar 3 has 48 samples.
## 变量的组间方差和组内方差
# 如果我们想计算特定变量的组内方差(例如,计算特定化学物质的浓度),我们可以使用下述的“calcWithinGroupsVariance()”函数:
# Pooled within-groups variance of a single variable.
#
# Args:
#   variable:      one-column data frame (e.g. wine[2]) or a plain numeric
#                  vector of observations.
#   groupvariable: one-column data frame (e.g. wine[1]) or a plain
#                  vector/factor giving the group label of each observation.
#
# Returns:
#   A numeric scalar: the sum over groups of the squared deviations from the
#   group mean, divided by (number of observations - number of groups).
#   Observations whose group label is NA are ignored, and factor levels with
#   no observations are not counted as groups. The result is NaN (0/0) when
#   every group holds a single observation, and NA if any retained value is NA.
calcWithinGroupsVariance <- function(variable, groupvariable) {
  # Callers traditionally pass one-column data frames; unwrap them so plain
  # vectors are accepted as well.
  values <- if (is.list(variable)) variable[[1]] else variable
  groups <- if (is.list(groupvariable)) groupvariable[[1]] else groupvariable
  stopifnot(length(values) == length(groups))

  labelled <- !is.na(groups)
  # drop = TRUE discards factor levels that have no observations.
  by_group <- split(values[labelled], as.factor(groups[labelled]), drop = TRUE)

  # Sum of squared deviations per group. Computing it directly (rather than
  # as (n - 1) * sd^2) yields 0 for a single-observation group, where sd()
  # would return NA and poison the total.
  sum_sq <- vapply(by_group, function(x) sum((x - mean(x))^2), numeric(1))

  sum(sum_sq) / (sum(lengths(by_group)) - length(by_group))
}
# For example, to compute the within-groups variance of V2 (the concentration of the first chemical), we type:
calcWithinGroupsVariance(wine[2],wine[1]) # [1] 0.2620525
# 我们可以通过下述的“calcBetweenGroupsVariance()”函数来计算特定变量(如V2)的组间方差:
# Between-groups variance of a single variable.
#
# Args:
#   variable:      one-column data frame (e.g. wine[2]) or a plain numeric
#                  vector of observations.
#   groupvariable: one-column data frame (e.g. wine[1]) or a plain
#                  vector/factor giving the group label of each observation.
#
# Returns:
#   A numeric scalar: the sum over groups of
#   group size * (group mean - grand mean)^2, divided by
#   (number of groups - 1). Observations whose group label is NA are ignored
#   (they are also left out of the grand mean), and factor levels with no
#   observations are not counted as groups. The result is NaN (0/0) when only
#   one group is present.
calcBetweenGroupsVariance <- function(variable, groupvariable) {
  # Callers traditionally pass one-column data frames; unwrap them so plain
  # vectors are accepted as well.
  values <- if (is.list(variable)) variable[[1]] else variable
  groups <- if (is.list(groupvariable)) groupvariable[[1]] else groupvariable
  stopifnot(length(values) == length(groups))

  labelled <- !is.na(groups)
  values <- values[labelled]
  # drop = TRUE discards factor levels that have no observations; an empty
  # level would otherwise contribute 0 * NaN and inflate the group count.
  by_group <- split(values, as.factor(groups[labelled]), drop = TRUE)

  grand_mean <- mean(values)
  group_sizes <- lengths(by_group)
  group_means <- vapply(by_group, mean, numeric(1))

  sum(group_sizes * (group_means - grand_mean)^2) / (length(by_group) - 1)
}
# It can be used like this to compute the between-groups variance of V2:
calcBetweenGroupsVariance(wine[2],wine[1]) # [1] 35.39742
# The "separation" achieved by a variable is its between-groups variance divided by its within-groups variance. For V2, this separation is:
calcBetweenGroupsVariance(wine[2],wine[1])/calcWithinGroupsVariance(wine[2],wine[1])
# 如果我们想通过多元统计数据的所有变量计算出间隔,你可以使用下述的“calcSeparations()”:
# Print the between-groups / within-groups variance ratio ("separation") of
# every variable.
#
# Args:
#   variables:     data frame (or anything as.data.frame() accepts) whose
#                  columns are the variables to evaluate.
#   groupvariable: group labels, passed unchanged to
#                  calcWithinGroupsVariance() and calcBetweenGroupsVariance().
#
# One line is printed per column, giving Vw, Vb and their ratio.
# Returns NULL invisibly.
calcSeparations <- function(variables, groupvariable) {
  variables <- as.data.frame(variables)
  variablenames <- colnames(variables)
  # seq_along() rather than 1:n, so a zero-column input prints nothing
  # instead of failing while trying to select column 1.
  for (i in seq_along(variables)) {
    # Single-bracket indexing keeps a one-column data frame, which is the
    # shape the two variance functions are called with elsewhere in this file.
    variablei <- variables[i]
    Vw <- calcWithinGroupsVariance(variablei, groupvariable)
    Vb <- calcBetweenGroupsVariance(variablei, groupvariable)
    print(paste("variable", variablenames[i],
                "Vw=", Vw, "Vb=", Vb, "separation=", Vb / Vw))
  }
  invisible(NULL)
}
# For example, to compute the separation of each of the 13 chemical concentrations, we type:
calcSeparations(wine[2:14],wine[1])
# 因此,在各组(葡萄酒品种)之间间隔最大的个体变量是V8(间隔为233.9)。
# 正如我们将在下面讨论的,线性判别分析(LDA)的目的是寻找个体变量的一个线性组合,使各组(这里是品种)之间的间隔达到最大。
# 我们希望这样得到的间隔优于任何单个变量所能达到的最佳间隔(此处是V8的233.9)。