码迷,mamicode.com
首页 > 其他好文 > 详细

文本挖掘

时间:2015-07-16 18:13:42      阅读:126      评论:0      收藏:0      [点我收藏+]

标签:

#
library("rJava")
library("Rwordseg")
library("NLP")
library("tm")
library(igraph)
#
# Text-mining script: segments a random sample of review comments into words,
# keeps the nouns, and plots the ten most frequent terms.

# NOTE(review): hard-coded, machine-specific working directory. Kept so the
# relative CSV path below resolves as before; consider a project-relative path.
setwd("E:\\毕业设计\\")

# Load the full review data set. The code below reads the columns `id`,
# `logo` and `comment` from it.
txt <- read.csv(file = "总评论文本.csv", header = TRUE)

# Work on a random sample of at most 2000 rows. Capping the size at nrow(txt)
# avoids the "cannot take a sample larger than the population" error on small
# inputs.
sample_size <- min(2000L, nrow(txt))
txt <- txt[sample.int(nrow(txt), sample_size), ]

# Inspecting the sample is only meaningful in an interactive session; the
# edited copy was never kept, so the result is discarded here as well.
if (interactive()) {
  invisible(edit(txt))
}

txt$id <- as.character(txt$id)
txt$logo <- as.character(txt$logo)
txt$comment <- as.character(txt$comment)

# Register domain phrases with the segmenter so they are kept as single
# tokens. One vectorised call replaces the repeated single-word calls (the
# duplicated "很快" entry is listed once).
insertWords(c(
  "非常满意", "非常好", "很好", "很不好", "不好", "很不错", "不错",
  "很快", "很有用", "有用", "很好用", "好用", "挺好用", "挺好",
  "性价比", "苏宁"
))

# Segment every comment; with nature = TRUE each token is named with its
# part-of-speech tag.
comment.Rwordseg <- segmentCN(txt$comment, nature = TRUE)

# Keep only the tokens tagged "n" (noun). The previous loop computed this
# subset but never assigned it, so no filtering actually took place.
# `%in%` is used instead of `==` so a missing tag cannot yield an NA index.
comment.Rwordseg.n <- lapply(comment.Rwordseg, function(words) {
  words[names(words) %in% "n"]
})

# Flatten the per-comment noun vectors into one character vector in a single
# step instead of growing it inside a loop.
cordata <- unlist(comment.Rwordseg.n, use.names = FALSE)
if (length(cordata) == 0L) {
  stop("No nouns were extracted from the sampled comments.", call. = FALSE)
}

# Build a corpus in which every noun is its own document, then drop
# uninformative words.
ovid <- Corpus(VectorSource(cordata))
ovid <- tm_map(
  ovid, removeWords,
  c("苏宁", "热水器", "没有", "有", "很", "给", "说", "没", "我", "后",
    "的", "了", "不", "买", "就", "是", "也", "还", "都")
)

# Term frequencies across the corpus; wordLengths = c(1, Inf) keeps
# single-character terms, which tm would otherwise drop.
dtm <- DocumentTermMatrix(ovid, control = list(wordLengths = c(1, Inf)))
comment.matrix <- as.matrix(dtm)
comment.freq <- colSums(comment.matrix)

# Ten most frequent terms. head() returns fewer entries when fewer than ten
# terms exist, instead of padding the result with NA.
comment.freq.stop <- head(sort(comment.freq, decreasing = TRUE), 10)

# Plot the frequencies and label each point with its term.
plot(comment.freq.stop)
text(seq_along(comment.freq.stop), comment.freq.stop, names(comment.freq.stop))

文本挖掘

标签:

原文地址:http://www.cnblogs.com/liuhuan2368935760/p/4651489.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!