标签:mahout mapreduce 数据挖掘 机器学习 r语言
阅读导读:~ vi weibo.csv
A,314,1091,1488
B,196,10455,327
C,557,51635228,13793
D,55,14655464,1681
E,318,547,4899
F,166,145,170
G,17,890,169
H,54,946759,17229
# Load the weibo account stats shown above (one row per user,
# no header row in the CSV), then name the columns.
weibo<-read.csv(file="weibo.csv",header=FALSE)
# id = user, follow = accounts followed, fans = followers, tweet = post count
names(weibo)<-c("id","follow","fans","tweet")
> data.frame(weibo[1],rank=rowSums(weibo[2:4]))
id rank
1 A 2893
2 B 10978
3 C 51649578
4 D 14657200
5 E 5764
6 F 481
7 G 1076
8 H 964042
1,19
1,21
2,11
2,17
2,21
3,1
3,20
3,2
3,7
3,6
3,10
4,3
4,6
5,19
5,11
5,2
6,4
6,12
6,18
6,15
6,10
6,5
7,9
7,18
7,10
8,3
8,11
8,7
8,16
8,14
9,6
10,8
10,18
11,13
11,3
12,9
12,4
12,16
12,5
13,19
13,1
13,6
14,7
14,17
14,19
14,1
14,5
14,2
15,11
15,14
15,12
16,20
17,4
17,6
18,10
18,11
18,15
18,14
19,18
20,10
20,5
21,24
22,11
23,17
24,15
25,24
library(igraph)
# Edge list of user relationships: column 1 follows column 2 (no header).
people<-read.csv(file="people.csv",header=FALSE)
# Plot the directed follower network with igraph, nodes laid out on a circle.
# data: two-column data.frame of edges (source, target); column names are
# overwritten, so any input names are accepted.
drawGraph <- function(data) {
  names(data) <- c("from", "to")
  net <- graph.data.frame(data, directed = TRUE)
  V(net)$label <- V(net)$name   # show node ids as labels
  V(net)$size <- 15
  E(net)$color <- grey(0.5)
  # simplify() removes duplicate edges and self-loops before plotting
  plot(simplify(net), layout = layout.circle)
}
# Visualise the follower network read from people.csv.
drawGraph(people)
#构建邻接矩阵
# Build the n x n adjacency matrix from an edge list.
# pages: data.frame with integer columns "src" (source node) and
#        "dist" (destination node).
# Returns a 0/1 matrix A with A[j, i] == 1 iff there is an edge i -> j
# (columns = source, rows = destination), the orientation expected by
# the column-normalising dProbabilityMatrix() below.
adjacencyMatrix <- function(pages) {
  n <- max(pages$src, pages$dist)
  A <- matrix(0, n, n)
  # Vectorised matrix-index assignment replaces the original per-row loop;
  # unlike `for (i in 1:nrow(pages))` it is also safe on an empty edge list.
  A[cbind(pages$dist, pages$src)] <- 1
  A
}
#变换概率矩阵
# Turn an adjacency matrix into the damped PageRank transition matrix:
#   A = (1 - d)/n + d * G / colSums(G)   (column-stochastic)
# G: square adjacency matrix (columns = source nodes).
# d: damping factor, default 0.85 (the classic PageRank value).
# Columns with no out-links have their sum forced to 1 so the division
# is defined; such dangling nodes contribute only the teleport term.
dProbabilityMatrix <- function(G, d = 0.85) {
  cs <- colSums(G)
  cs[cs == 0] <- 1          # avoid 0/0 for dangling nodes
  n <- nrow(G)
  delta <- (1 - d) / n      # uniform "teleport" probability per node
  # sweep() divides each column by its sum in one vectorised step,
  # replacing the original row-by-row loop with identical results.
  delta + d * sweep(G, 2, cs, "/")
}
#递归计算矩阵特征值
# Approximate the dominant eigenvector of G by power iteration:
# repeatedly multiply a start vector by G, then normalise to sum 1.
# Returns an n x 1 matrix (the result of the final `%*%`).
eigenMatrix <- function(G, iter = 100) {
  v <- rep(1, nrow(G))
  for (step in seq_len(iter)) {
    v <- G %*% v
  }
  v / sum(v)
}
#直接计算矩阵特征值
# Exact dominant eigenvector of G via eigen decomposition,
# normalised so the entries sum to 1 (sign cancels in the division).
calcEigenMatrix <- function(G) {
  principal <- eigen(G)$vectors[, 1]
  v <- Re(principal)
  v / sum(v)
}
# Full PageRank pipeline on the follower edge list:
# read edges -> adjacency matrix -> damped transition matrix -> eigenvector.
people<-read.csv(file="people.csv",header=FALSE)
names(people)<-c("src","dist");people
A<-adjacencyMatrix(people);A
G<-dProbabilityMatrix(A);G
# q[i] is the PageRank score of user i.
q<-calcEigenMatrix(G);
q
[1] 0.03274732 0.03404052 0.05983465 0.03527074 0.04366519 0.07042752 0.02741232
[8] 0.03378595 0.02118713 0.06537870 0.07788465 0.03491910 0.03910097 0.05076803
[15] 0.06685364 0.01916392 0.02793695 0.09450614 0.05056016 0.03076591 0.02956243
[22] 0.00600000 0.00600000 0.03622806 0.00600000
# Rank users by PageRank score. The original snippet used `userid`
# before defining it anywhere; the printed output (sorted by PR
# descending) shows it must be the ranking order of q.
userid <- order(q, decreasing = TRUE)
result <- data.frame(userid = userid, PR = q[userid])
result
userid PR
1 18 0.09450614
2 11 0.07788465
3 6 0.07042752
4 15 0.06685364
5 10 0.06537870
6 3 0.05983465
7 14 0.05076803
8 19 0.05056016
9 5 0.04366519
10 13 0.03910097
11 24 0.03622806
12 4 0.03527074
13 12 0.03491910
14 2 0.03404052
15 8 0.03378595
16 1 0.03274732
17 20 0.03076591
18 21 0.02956243
19 17 0.02793695
20 7 0.02741232
21 9 0.02118713
22 16 0.01916392
23 22 0.00600000
24 23 0.00600000
25 25 0.00600000
# Inspect every edge touching the top-ranked user (id 18):
# first the users 18 follows, then the users following 18.
people[c(which(people$src==18), which(people$dist==18)),]
src dist
55 18 10
56 18 11
57 18 15
58 18 14
19 6 18
24 7 18
33 10 18
59 19 18
# Positions in the PageRank table of the users who follow user 18.
which(result$userid %in% people$src[which(people$dist==18)])
[1] 3 5 8 20
# Out-degree (number of follows) of each user that follows user 18.
table(people$src)[people$src[which(people$dist==18)]]
6 7 10 19
6 3 2 1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1
10,1
11,1
12,1
13,1
14,1
15,1
16,1
17,1
18,1
19,1
20,1
21,1
22,1
23,1
24,1
25,1
~ hadoop fs -cat /user/hdfs/pagerank/tmp1/part-r-00000|head -n 4
1 0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.43100002,0.005999999,0.43100002,0.005999999,0.005999999,0.005999999,0.005999999
10 0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.43100002,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.43100002,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999
11 0.005999999,0.005999999,0.43100002,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.43100002,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999
12 0.005999999,0.005999999,0.005999999,0.2185,0.2185,0.005999999,0.005999999,0.005999999,0.2185,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.2185,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999,0.005999999
~ hadoop fs -cat /user/hdfs/pagerank/pr/part-r-00000
1 0.716666
10 1.354167
11 2.232500
12 0.575000
13 0.575000
14 0.815833
15 1.354167
16 0.532500
17 1.425000
18 1.850000
19 1.283334
2 0.716667
20 1.141667
21 0.858333
22 0.150000
23 0.150000
24 1.850000
25 0.150000
3 1.170001
4 0.929167
5 1.070833
6 2.275001
7 0.603333
8 0.575000
9 0.645833
~ hadoop fs -cat /user/hdfs/pagerank/result/part-r-00000
1 0.032842
10 0.065405
11 0.077670
12 0.034864
13 0.039175
14 0.050574
15 0.066614
16 0.019167
17 0.027990
18 0.094460
19 0.050673
2 0.034054
20 0.030835
21 0.029657
22 0.006000
23 0.006000
24 0.036111
25 0.006000
3 0.059864
4 0.035314
5 0.043805
6 0.070516
7 0.027444
8 0.033715
9 0.021251
id pr
10 18 0.094460
3 11 0.077670
22 6 0.070516
7 15 0.066614
2 10 0.065405
19 3 0.059864
11 19 0.050673
6 14 0.050574
21 5 0.043805
5 13 0.039175
17 24 0.036111
20 4 0.035314
4 12 0.034864
12 2 0.034054
24 8 0.033715
1 1 0.032842
13 20 0.030835
14 21 0.029657
9 17 0.027990
23 7 0.027444
25 9 0.021251
8 16 0.019167
15 22 0.006000
16 23 0.006000
18 25 0.006000
标签:mahout mapreduce 数据挖掘 机器学习 r语言
原文地址:http://blog.csdn.net/u013361361/article/details/40951527