2、R进阶

时间：2020-03-21 21:34:19 阅读：69 评论：0 收藏：0 [点我收藏+]

标签：data 赋值 UNC nbsp sub tor dimens amp 路径

R进阶

1. 函数

R 主要面向统计计算，很少会用到面向对象的编程方法（但可以基本实现）。
R语言是动态语言，不需要事先定义变量的属性
R的函数参数按值传递（copy-on-modify）：在函数内部修改参数不会改变调用者的变量

1.1 举例

# Count how many elements of the numeric vector `v` are divisible by 7.
# Returns a single number.
div7 <- function(v) {
  is_multiple <- v %% 7 == 0
  # Summing a logical vector counts its TRUE entries.
  sum(is_multiple)
}
# Now let's test our function
v1 = c(1, 2, 5, 6, 7, 3)
div7(v1)

1.2 函数并不改变输入参数的状态（类似纯函数）

# Return a copy of `a` whose second element is set to 7.
# The assignment happens on the function's own copy, so the vector held by
# the caller is left unchanged.
test <- function(a) {
  replace(a, 2, 7)
}

> a
[1] 1 2 3
> b = test(a)
> b
[1] 1 7 3
> a
[1] 1 2 3

2. 控制语句

2.1 If/else statement

# Branch on which open interval the scalar x falls into.
x <- 10
# `&&` is the scalar, short-circuiting operator meant for `if` conditions;
# `&` is the element-wise operator for building vector masks.
if (x > 1 && x < 7) {
  print("x is between 1 and 7")
} else if (x > 8 && x < 15) {
  print("x is between 8 and 15")
} else {
  # This branch also catches the boundary values and anything in [7, 8].
  print("x is outside both ranges")
}

2.2 For/while loop

# Print every element of x, one per iteration.
x <- c(1, 2, 3, 4, 5)
# seq_along(x) tracks the actual length of x (and yields an empty sequence
# for an empty vector), unlike a hard-coded 1:5.
for (i in seq_along(x)) {
  print(x[i])
}

# Keep incrementing x while it has not passed 4.987, printing x together
# with x - 2 and x - 1 after every increment.
x <- 2.987
while (x <= 4.987) {
  x <- x + 0.987
  shown <- c(x, x - 2, x - 1)
  print(shown)
}

2.3 Repeat loop

# repeat has no condition of its own: the loop only ends at `break`.
a <- 1
repeat {
  print(a)
  a <- a + 1
  # Leave the loop once a has moved past 4 (so 1, 2, 3, 4 are printed).
  if (a > 4) {
    break
  }
}

2.4 next跳过当前循环

# Print 1..4 but skip 2: `next` jumps straight to the following iteration.
x <- 1:4
for (i in x) {
  if (i == 2) next
  print(i)
}

3. 高级数据结构List、Data Frames

3.1 List（注意直接连接不同的list不会去除重复，列表相当于Java中list和map的合体）

# An unnamed list, then two named lists with mixed element types.
l <- list("John", "Silver")
w <- list(
  name = "John", surname = "Silver", alias = "Long John",
  age = 30, alive = "yes"
)
z <- list(
  name = "James", surname = "McGraw", alias = "James Flint",
  age = 45, alive = "unknown"
)
# c() on lists concatenates them; the result is still a list and keeps
# every element, including the repeated names.
v <- c(w, z)

获取元素（注意是[[]]）：

# [[ ]] extracts the element itself, either by position or by name.
w[[2]] # [1] "Silver"
w[["surname"]] # [1] "Silver"

注意list不能直接参与计算

# NOTE(review): min_list is not defined at this point; it appears to be the
# lapply() result built in section 6.2 - confirm.
min_list[2] # returns a list [4]
min_list[[2]] # returns 4
min_list[[2]] * 2 # returns 8

3.2 Data Frames

类似于一个table，每一列具有相同的数据类型，不同的列可以具有不同的数据类型。类似于sql中的表，若在面向对象语言中，需要为每一行构建一个object，然后封装成list

3.2.1 构建frame

# Four parallel vectors of length 7, one per future column.
name <- c(
  "mary", "elisabeth", "lydia", "kitty",
  "fitzwilliam", "charles", "georgiana"
)
age <- c(21, 20, 18, 17, 27, 25, 15)
status <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE)
savings <- c(500, 300, 200, 100, 10000, 20000, 20000)

# Build the data frame and give the columns their final names in one step.
df <- setNames(
  data.frame(name, age, status, savings),
  c("ID", "Age", "Alive", "Funds")
)

或者
# Name the columns directly in the constructor; savings goes into "Funds"
# so the result matches the names(df) assignment shown above.
df <- data.frame(ID = name, Age = age, Alive = status, Funds = savings)

3.2.2 获取特定行与列

注意获取的一列是向量vector，多列才是frame.

df[,2] # 2nd column (a single column comes back as a vector)
df[2:4,] # a contiguous range of rows (the same idea works for columns)
df[c("ID","Funds")] # select specific columns by name
df$ID  # variable ID in the data frame
df[df$Age>20 & df$Funds>5000,] # rows whose values satisfy the condition

函数使用：
# Functions can be applied directly to a column extracted with `$`.
mean_age = mean(df$Age)
# summary() gives descriptive statistics of the column, not its sum.
sum_funds = summary(df$Funds)

修改元素：
# Two ways to overwrite Funds for the rows where Age > 20 and Alive is FALSE:
# select the rows and assign through `$`, or index rows and column together.
df[df$Age>20 & df$Alive==FALSE,]$Funds = 0
df[df$Age>20 & df$Alive==FALSE,"Funds"] = -Inf

添加新的列
# Assigning to a column name that does not exist yet appends a new column;
# seven values are supplied here.
df$Married = c(TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE)

4. Lists 与 Data Frames对比

list 可以将不同维度的数据直接进行结合（注意不会去除重复的数据），而Data Frames不行（并且名字不同也不行）
# Two lists with a different number of elements.
js= list(name="John")
jm= list(name="James", surname= "McGraw")
people = c(js,jm) # this works fine

# Three one-row data frames used to show when rbind() refuses to combine.
john <- data.frame(name = "John")
eleanor <- data.frame(name = "Eleanor", surname = "Gurthrie")
james <- data.frame(name = "James", familyname = "McGraw")
# Both calls fail on purpose; try() reports the error without aborting a
# script that is being sourced.
try(rbind(john, eleanor)) # number of columns does not match (1 vs 2)
# Compare two frames with the SAME number of columns so that the failure is
# really caused by the differing column names (surname vs familyname).
try(rbind(eleanor, james)) # column names do not match

5. 文件管理

5.1 读、写TXT文本

# Write A to data4.txt with column headers T1 and T2 and without row names.
# NOTE(review): A is not defined in this snippet.
write.table(A, file="data4.txt", col.names = c("T1", "T2"), row.names=FALSE)  
# Read data.txt as a table with no header row and ":" as field separator.
L= read.table("data.txt", header=FALSE, sep=":")

5.2 读、写CSV文本

# Read a comma-separated file whose first line holds the column names.
data = read.csv(file="Path/to/file/data.csv", header=TRUE, sep=",")
# Write the data frame back out as CSV.
write.csv(data, file = "myData.csv")

5.3 获取、设置当前路径

# Show the current working directory.
getwd()
# Change the working directory; relative paths are resolved against it.
setwd("/Path/to/RCode")

6. Apply函数使用

6.1 apply(X, MARGIN, FUN, . . . )

X是array（维度为2）或者matrix
MARGIN定义函数被用来作用行还是列，1表示行，2表示列，MARGIN=c(1,2)作用于行和列
FUN是如何处理数据的函数

举例：

apply(m,1,mean) # mean of every row
apply(m,2,mean) # mean of every column

6.2 lapply(X, FUN, . . . )

作用于每一个元素，返回list，可以视为apply的更高一级的封装 1. X 是 vector（原子向量）、list 或 data frame 2. FUN是作用于每个元素的函数

举例：

# Three matrices of different shapes, collected in a list.
M1 <- matrix(1:9, nrow = 3, ncol = 3)
M2 <- matrix(4:15, nrow = 4, ncol = 3)
M3 <- matrix(8:10, nrow = 3, ncol = 2)
matrices_list <- list(M1, M2, M3)

# Each element handed to min() is a whole matrix; lapply() returns a list
# with one minimum per matrix.
min_list <- lapply(matrices_list, min)
min_list
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 8

min_list[2] # returns a list [4]: single brackets keep the list wrapper
min_list[[2]] # returns 4: double brackets give the value itself

练习：
Write a function that given an N-dimensional point x and an MxN-dimensional matrix A, returns the closest point to x in A.

# Find the row of A that is closest (Euclidean distance) to the point b.
#
# b: numeric vector of length N.
# A: M x N numeric matrix; every row is a candidate point.
# Returns the closest row of A as a vector. Signals an error when the length
# of b differs from the number of columns of A, instead of printing a message
# and handing that message back as if it were a result.
# Could be extended to return the distance as well.
calc_dist <- function(b, A) {
  if (length(b) != ncol(A)) {
    stop(
      "Dimensions do not match. Distances cannot be calculated.",
      call. = FALSE
    )
  }
  # One distance per row of A; b is picked up from the enclosing function.
  dists <- apply(A, MARGIN = 1, FUN = function(x) sqrt(sum((x - b)^2)))
  # which.min() returns the first row index on ties.
  A[which.min(dists), ]
}

6.3 sapply(X, FUN, . . . , simplify = TRUE)

simplify = TRUE 意味着简化输出
simplify = FALSE 等效于lapply()

# Same three matrices as in the lapply() example.
M1 <- matrix(1:9, nrow = 3, ncol = 3)
M2 <- matrix(4:15, nrow = 4, ncol = 3)
M3 <- matrix(8:10, nrow = 3, ncol = 2)
matrices_list <- list(M1, M2, M3)
# With simplify = FALSE the result stays a list, one mean per matrix.
means1 <- sapply(matrices_list, mean, simplify = FALSE)
means1
## [[1]]
## [1] 5
##
## [[2]]
## [1] 9.5
##
## [[3]]
## [1] 9

对比：

# With simplify = TRUE the per-matrix means are collapsed into one vector.
means2 = sapply(matrices_list, mean, simplify=TRUE)
means2
## [1] 5.0 9.5 9.0 (the result is a vector)

7. 练习题补充

7.1 向量的交、并、差集

交集：intersect(A,B)
并集：union(A,B)
差集：setdiff(A,B)，即属于A但不属于B的元素

练习：Create a function that given two vectors v and u, returns two vectors: one with the values present in both vectors and one with the values present in one of the vectors and not the other.

# Split the values of two vectors into shared and non-shared ones.
#
# u, v: vectors to compare.
# Returns a list with two elements:
#   inter - values present in both u and v
#   outer - values present in exactly one of them (those only in u first,
#           then those only in v)
inter_outer <- function(u, v) {
  res1 <- intersect(u, v)
  # Use the parameters u and v here; the names a and b do not exist inside
  # this function.
  res2 <- c(setdiff(u, v), setdiff(v, u))
  list(inter = res1, outer = res2)
}

7.2 向量、矩阵的遍历

for (i in A) 其中A可以是向量或者矩阵

7.3 可以在函数中引入包

Write a function that, given a numerical one-dimensional list L, returns which values inside that list are prime numbers. (Hint: Remember schoolmath)

# Keep only the prime values of the list L and return them wrapped in a list.
# The schoolmath package is attached inside the function, which shows that a
# package can be loaded from within a function body.
primes_l <- function(L) {
  library(schoolmath)
  # Flatten the list to a vector so it can be filtered with a logical mask.
  values <- unlist(L)
  prime_values <- values[is.prim(values)]
  list(prime_values)
}

扩展：向量和列表的转换

List -> vector
# unlist() flattens the list L into a plain vector.
a = unlist(L)

vector -> List
# list() wraps the whole vector 5..30 as ONE list element; as.list() would
# instead give one list element per value.
L = list(seq(5,30))

7.4 字符串连接paste()

# Save the rows of `menu` whose Nutrition column equals `nutri` to a CSV
# file named "menu_<nutri>.csv" in the working directory.
nutri_preference <- function(nutri, menu) {
  # The file name is assembled from the variable, so it changes with `nutri`.
  out_file <- paste0("menu_", nutri, ".csv")
  matching_rows <- menu[menu$Nutrition == nutri, ]
  write.csv(matching_rows, out_file, row.names = FALSE)
}

nutri_preference("Vegan",menu)

其中sep是不同字符间的连接符
注意，menu[menu$Nutrition==nutri,]中的nutri可以是变量

7.5 获取两个不同数据框中交叉部分merge()

参考链接：https://blog.csdn.net/neweastsun/article/details/79435271

merge可以实现选出满足条件的数据的行，同时能包含不同数据帧的不同的属性（列）

实例1： Write a function that, given a data frame called order which has a variable number of food items and the quantity of these, prints the price of ordering all of the items on screen.

# Compute the total price of an order.
#
# order: data frame with columns Food and Quant.
# menu:  data frame with columns Food and Price.
# Returns a sentence (character string) stating the total.
calc_order <- function(order, menu) {
  # Join the ordered items with their menu entries on the shared Food column.
  priced <- merge(order, menu, by = "Food")
  # Columns can be multiplied element-wise and then summed.
  total <- with(priced, sum(Quant * Price))
  paste("The total of your order is", total)
}
order1 = data.frame(Food = c("Toast","Tea"), Quant = c(2, 4)) #生成新的frame

calc_order(order1, menu)

实例2：

# test1 describes one food, test2 lists quantities for two foods.
test1 <- data.frame(Food = "Toast", taste = "well")
test2 <- data.frame(Food = c("Toast", "Tea"), Quant = c(2, 4))
# Only rows whose Food appears in both frames survive the merge, and the
# result carries the columns of both.
combi <- merge(test1, test2, by = "Food")

> combi
   Food taste Quant
1 Toast  well     2

7.6 which在frame中使用

能返回列的名字

> ms
  Biology Maths Chemistry Physics Programming Statistics
1      32    52        50      44          50         89
> which.max(ms)
Statistics 
         6

7.7 %in%、subset使用

%in%能判断元素是否存在frame中
subset获得frame的子集

实例：

# Report the average mark of student `a` and of everyone of the same age.
#
# a:     student name to look up in class$Names.
# class: data frame with columns Names and Age; columns 4 to 9 are averaged.
# Returns a list (personal_avg, cohort_avg), or a message string when no
# student has that name.
checkgrade <- function(a, class) {
  # Guard clause: unknown name.
  if (!(a %in% class$Names)) {
    return(paste("No student with the name", a))
  }
  student <- subset(class, Names == a)
  personal <- rowMeans(student[4:9])
  # Everyone whose Age equals the student's Age (the student included).
  cohort <- subset(class, Age == student$Age)
  list(personal_avg = personal, cohort_avg = rowMeans(cohort[4:9]))
}
checkgrade("John",class2018)

7.8 获取frame的某一列的方法

frames$name
但是事先必须知道想要获得哪一列，即name
frames[,name]
事先可以不知道名字，name可以是变量

实例:
Write a function that, given a module and a threshold mark, it saves on a TXT the name of people who have obtained less than that mark in the module.

# List the students who scored below `mark` in `module`.
#
# module: column name of the module, given as a string.
# mark:   threshold; strictly lower marks are selected.
# class:  data frame with a Names column and one column per module.
# Writes the selected names to "names.txt" and also returns them.
module_thres <- function(module, mark, class) {
  # class[, module] allows the column to be chosen through a variable.
  is_below <- class[, module] < mark
  low_scorers <- class$Names[is_below]
  write.table(low_scorers, "names.txt", row.names = FALSE)
  low_scorers
}
module_thres("Maths",70,class2018)

注意，frames的嵌套使用

2、R进阶

标签：data 赋值 UNC nbsp sub tor dimens amp 路径

原文地址：https://www.cnblogs.com/Stephanie-boke/p/12541858.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行