利用RCurl抓取电影团购信息

时间：2014-08-21 21:20:55 阅读：259 评论：0 收藏：0 [点我收藏+]

1 抓取的网址是360团购

http://tuan.360.cn/bei_jing/c_0.html?kw=电影&pageno=1#tuanFilter

2 利用firefox的FireBug插件分析其源代码，如下所示：

bubuko.com,布布扣

"//*/h3[@class=‘desc‘]" 匹配电影院名称
"//*/span [@class=‘discount‘]" 匹配原价
"//*/span [@class=‘price‘]" 匹配优惠价
"//*/div [@class=‘other-info clearfix‘]" 匹配备注信息
"//*/div[@class=‘source clearfix‘]" 匹配团购信息来源

3 源代码

## 利用RCurl抓取电影团购信息

library(RCurl)
library(XML)
library("plyr")

page <- 1:5
urlist[page]  <- paste("http://tuan.360.cn/bei_jing/c_0.html?kw=电影&pageno=",page,"#tuanFilter",sep="")
#伪造报头
myheader=c("User-Agent"="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
           "Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
           "Accept-Language"="en-us",
           "Connection"="keep-alive",
           "Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)
#电影院名称
dyy_name<-c("")
#原价
last_price<-c("")
#优惠价格
now_price<-c("")
#备注
others<-c("")
#来源
tg_source<-c("")
for(url in urlist){
  #下载网址
  webpage  <- getURL(url,httpheader=myheader,.encoding="utf-8")
  #转化成XML格式
  pagetree <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
  #利用XPATH筛选
  temp_name <- xpathSApply (pagetree, "//*/h3[@class=‘desc‘]", xmlValue)
  dyy_name<-c(dyy_name,temp_name)
  temp_price <- xpathSApply (pagetree, "//*/span [@class=‘discount‘]",xmlValue)
  last_price<-c(last_price,temp_price)
  temp_now_price <- xpathSApply (pagetree, "//*/span [@class=‘price‘]",xmlValue)
  now_price<-c(now_price,temp_now_price)
  temp_others <- xpathSApply (pagetree, "//*/div [@class=‘other-info clearfix‘]",xmlValue)
  others<-c(others,temp_others)
  temp_tg_source <- xpathSApply (pagetree, "//*/div[@class=‘source clearfix‘]", xmlValue)
  tg_source<-c(tg_source,temp_tg_source)
}
# 删除不必要的信息
tg_source<-laply(as.list(tg_source),function(x){
  unlist(strsplit(x,"\n"))[2]
})
# 删除不必要的信息
others<-laply(as.list(others),function(x){
  unlist(strsplit(x,"\n"))[2]
})
#组装成数据库
content<-data.frame(dyy_name,last_price,now_price,others,tg_source)
names(content)<-c(‘影院‘,‘原价‘,‘优惠价‘,‘备注‘,‘来源‘)

#写入csv文件
write.csv(content,file="电影团购信息.csv")

利用RCurl抓取电影团购信息,布布扣,bubuko.com

利用RCurl抓取电影团购信息

标签：des http java os io 文件 for 数据

原文地址：http://my.oschina.net/u/1431368/blog/305311

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行