Main crawler script:
#coding: utf-8
'''
Created on 2016-04-21
@author: Administrator
'''
import uuid
'''
Multithreaded crawler for the Tianya Zatan (天涯杂谈) board.
Crawls one month of data (April 2016).
'''
import requests,re
import json
import time
import MySQLdb
from sqlUtil2 import saveTopic,saveUser,saveRelation,saveComment
from multiprocessing.dummy import Pool as ThreadPool
s = set()  # user IDs whose fans/followees have already been crawled (shared across threads; re-initialized in __main__)
def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=1)  # 1-second request timeout
    html.encoding = 'utf-8'
    return html
def getAttentionHtml(userId, pageNo):
    # One page (28 entries) of the users this user follows, from Tianya's JSON API
    url = 'http://www.tianya.cn/api/tw'
    data = {
        'method': 'following.ice.select',
        'params.userId': userId,
        'params.pageSize': '28',
        'params.pageNo': pageNo
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, params=data, timeout=1)  # 1-second request timeout
    html.encoding = 'utf-8'
    return html
def getFansHtml(userId, pageNo):
    # One page (28 entries) of this user's fans, from Tianya's JSON API
    url = 'http://www.tianya.cn/api/tw'
    data = {
        'method': 'follower.ice.select',
        'params.userId': userId,
        'params.pageSize': '28',
        'params.pageNo': pageNo
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, params=data, timeout=1)  # 1-second request timeout
    html.encoding = 'utf-8'
    return html
def getContnetByReg(reg, text):
    return re.findall(reg, text, re.S)
def getReplyData(url):
    # Each match: (replier link, replier name, reply time, reply content)
    reg = r'class="atl-item".+?class="atl-info".+?href="(.+?)".+?>(.+?)</a>.+?<span>(.+?)</span>.+?class="bbs-content">(.+?)</div>'
    dataList = getContnetByReg(reg, getHtml(url).text)
    return dataList
def getTopicData(url):
    # Each match: (title, author link, author name, post time, scan count, reply count, content)
    reg = r'class="s_title".+?<span.+?>(.+?)</span>.+?div class="atl-info".+?href="(.+?)".+?>(.+?)</a>.+?<span>(.+?)</span>.+?<span>(.+?)</span>.+?<span>(.+?)</span>.+?class="atl-main".+?class="bbs-content clearfix">(.+?)</div>'
    dataList = getContnetByReg(reg, getHtml(url).text)
    return dataList
def getAuthorInfo(authorUrl):
    # Each match from the profile page: (followees link, followee count, fans link, fan count)
    reg = r'class="relate-link".+?href="(.+?)">(.+?)</a>.+?href="(.+?)">(.+?)</a>'
    dataList = getContnetByReg(reg, getHtml(authorUrl).text)
    return dataList
def getAttentionList(userId, num):
    html = getAttentionHtml(userId, num)  # fetch once, reuse for debug print and JSON
    print html.text
    return html.json()["data"]["user"]
def getFansList(userId, num):
    html = getFansHtml(userId, num)  # fetch once, reuse for debug print and JSON
    print html.text
    return html.json()["data"]["user"]
def printFans(userId, num, username, conn):
    # Save every fan of this user: one relation row (fan -> user) plus the fan's profile
    print '================ fans ====================='
    if num % 28 == 0:
        x = num / 28
    else:
        x = num / 28 + 1
    # Some users have huge fan lists (e.g. http://www.tianya.cn/43178991/fans) -- cap the page count
    if x >= 200:
        x = x / 10
    for i in range(1, x + 1):
        print '------ page', i, '------'
        fansList = getFansList(userId, i)
        for res in fansList:
            try:
                # Save the relation
                relationParams = (str(uuid.uuid4()), res["name"], username)  # str() so MySQLdb can bind the UUID
                saveRelation(relationParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            try:
                # Save the user
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                authorUrl = 'http://www.tianya.cn/' + str(res["id"])
                userParams = (res["id"], res["fansCount"], res["followCount"], res["name"], authorUrl, grabTime)
                saveUser(userParams, conn)
                print res["id"], res["name"], res["followCount"], res["fansCount"]
            except Exception, e:
                print 'failed!..', 'exception is: ', e
def printAttention(userId, num, username, conn):
    # Save every user this user follows: one relation row (user -> followee) plus the followee's profile
    print '================ followees ====================='
    if num % 28 == 0:
        x = num / 28
    else:
        x = num / 28 + 1
    print x
    for i in range(1, x + 1):
        print '------ page', i, '------'
        attentList = getAttentionList(userId, i)
        for res in attentList:
            try:
                relationParams = (str(uuid.uuid4()), username, res["name"])  # str() so MySQLdb can bind the UUID
                saveRelation(relationParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            try:
                # Save the user
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                authorUrl = 'http://www.tianya.cn/' + str(res["id"])
                userParams = (res["id"], res["fansCount"], res["followCount"], res["name"], authorUrl, grabTime)
                saveUser(userParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            print res["id"], res["name"], res["followCount"], res["fansCount"]
def getTopicAllInfo(topicDataList, replyDataList, authorUrl, topiclink):
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='networkpublicopinionmap3', port=3306, charset='utf8')
    for topic in topicDataList:
        # Post time: keep everything after the first colon (maxsplit=1 so "10:00:00" is not truncated)
        postTime = topic[3].strip().split(':', 1)[1]
        print '******', s
        print 'topiclink: ', topiclink
        print 'topicId: ', topiclink.split('-')[-2]
        print 'title: ', topic[0].strip()
        print 'authorLink: ', topic[1].strip()
        print 'authorName: ', topic[2].strip()
        print 'postTime: ', postTime
        print 'scanNum: ', topic[4].strip().split(':')[1]
        print 'replyNum: ', topic[5].strip().split(':')[1]
        print 'content: ', topic[6].strip()
        userId = topic[1].strip().split('/')[-1]
        infoList = getAuthorInfo(topic[1].strip())  # author profile info (fans, followees, ...)
        for info in infoList:
            print '\tattentionNums: ', int(info[1].strip())
            print '\tfansNum: ', int(info[3].strip())
            try:
                # Save the author
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                userparams = (userId, info[3].strip(), info[1].strip(), topic[2].strip(), topic[1].strip(), grabTime)
                saveUser(userparams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if userId not in s:
                s.add(userId)
                if int(info[1].strip()) != 0:
                    # Save the author's followees and the follow relations
                    printAttention(userId, int(info[1].strip()), topic[2].strip(), conn)
                if int(info[3].strip()) != 0:
                    # Save the author's fans and the fan relations
                    printFans(userId, int(info[3].strip()), topic[2].strip(), conn)
        try:
            # Save the topic
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            params = (topiclink.split('-')[-2], topiclink, topic[0].strip(), topic[6].strip(), topic[4].strip().split(':')[1], topic[5].strip().split(':')[1], topic[3].strip().split(':', 1)[1], userId, grabTime)
            saveTopic(params, conn)
        except Exception, e:
            print 'saveTopic-failed!..', 'exception is: ', e
    for data in replyDataList:
        print 'replyerLink: ', data[0].strip()
        print 'replyerName: ', data[1].strip()
        print 'dateTime: ', data[2].strip().split(':', 1)[1]
        print 'content: ', data[3].strip()
        replyerId = data[0].strip().split('/')[-1]
        infoList = getAuthorInfo(data[0].strip())  # replier profile info (fans, followees, ...)
        for info in infoList:
            print '\tattentionNums: ', info[1].strip()
            print '\tfansNum: ', info[3].strip()
            try:
                # Save the replier
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                replyerParams = (replyerId, info[3].strip(), info[1].strip(), data[1].strip(), data[0].strip(), grabTime)
                saveUser(replyerParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if replyerId not in s:
                s.add(replyerId)
                if int(info[1].strip()) != 0:
                    printAttention(replyerId, int(info[1].strip()), data[1].strip(), conn)
                if int(info[3].strip()) != 0:
                    printFans(replyerId, int(info[3].strip()), data[1].strip(), conn)
        try:
            # Save the comment
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            commentParams = (str(uuid.uuid4()), data[3].strip(), data[2].strip().split(':', 1)[1], topiclink.split('-')[-2], replyerId, grabTime)
            saveComment(commentParams, conn)
        except Exception, e:
            print 'failed!..', 'exception is: ', e
    conn.close()
def getReplyAllInfo(topicDataList, replyDataList, authorUrl, topiclink):
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='networkpublicopinionmap3', port=3306, charset='utf8')
    print '............ comments from page 2 onward ............'
    for data in replyDataList:
        print 'topiclink: ', topiclink
        print 'replyerLink: ', data[0].strip()
        print 'replyername: ', data[1].strip()
        print 'dateTime: ', data[2].strip().split(':', 1)[1]
        print 'content: ', data[3].strip()
        replyerId = data[0].strip().split('/')[-1]
        infoList = getAuthorInfo(data[0].strip())  # replier profile info (fans, followees, ...)
        for info in infoList:
            print '\tattentionNums: ', info[1].strip()
            print '\tfansNum: ', info[3].strip()
            try:
                # Save the replier
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                replyerParams = (replyerId, info[3].strip(), info[1].strip(), data[1].strip(), data[0].strip(), grabTime)
                saveUser(replyerParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if replyerId not in s:
                s.add(replyerId)
                if int(info[1].strip()) != 0:
                    printAttention(replyerId, int(info[1].strip()), data[1].strip(), conn)
                if int(info[3].strip()) != 0:
                    printFans(replyerId, int(info[3].strip()), data[1].strip(), conn)
        try:
            # Save the comment
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            comment2Params = (str(uuid.uuid4()), data[3].strip(), data[2].strip().split(':', 1)[1], topiclink.split('-')[-2], replyerId, grabTime)
            saveComment(comment2Params, conn)
        except Exception, e:
            print 'saveComment()-failed!..', 'exception is: ', e
    conn.close()
def spider(url):
    originalUrl = 'http://bbs.tianya.cn'
    authorUrl = 'http://www.tianya.cn'
    reg = r'</tbody>(.+?)</table>'
    regLink = r'div class="links".+?</a>.+?href="(.+?)"'
    html = getHtml(url)
    nextLink = getContnetByReg(regLink, html.text)
    print 'nextLink: ', originalUrl + nextLink[0]
    n = 1
    while nextLink[0]:
        print '............... page', n, '..................'
        contentList = getContnetByReg(reg, html.text)
        for content in contentList:
            resreg = r'class="td-title faceblue">.+?href="(.+?)".+?(.+?)</'
            resultList = getContnetByReg(resreg, content)
            for result in resultList:
                # Check the post time first
                try:
                    pageHtml = getHtml(originalUrl + result[0].strip())
                    postTimeReg = r'class="s_title".+?div class="atl-info".+?</a>.+?<span>(.+?)</span>'  # regex for the post time
                    postTimeList = getContnetByReg(postTimeReg, pageHtml.text)
                    postTime = postTimeList[0].strip().split(':', 1)[1]
                    print 'postTime: ', postTime
                    if postTime.startswith('2016-03'):
                        print 'end..'
                        return
                    if not postTime.startswith('2016-04'):
                        print 'continue...'
                        continue
                    print 'start..'
                    # Fetch the topic and its first page of replies
                    replyDataList = getReplyData(originalUrl + result[0].strip())
                    topicDataList = getTopicData(originalUrl + result[0].strip())
                    print '================================================='
                    # First check whether the topic has a pager at all
                    isPageReg = r'class="atl-head".+?<div>(.+?)</div>'  # regex for the pager block
                    isPage = getContnetByReg(isPageReg, pageHtml.text)
                    print 'isPage[0]: ', isPage[0].strip()
                    # No pager: the first page holds everything
                    if isPage[0].strip() == '':
                        getTopicAllInfo(topicDataList, replyDataList, authorUrl, originalUrl + result[0].strip())
                    # Otherwise walk all the pages
                    else:
                        regPage = r'class="atl-pages">.+?</strong>.+?<(.+?)>'  # tells whether the current page is the last one
                        flag = getContnetByReg(regPage, pageHtml.text)
                        print 'flag: ', flag[0]
                        # Page 1
                        print '---------------------------------- page 1 ----------------------------------------------'
                        getTopicAllInfo(topicDataList, replyDataList, authorUrl, originalUrl + result[0].strip())
                        # While the current page still has a next page
                        i = 1
                        while flag[0] != 'span':
                            i += 1
                            print '---------------------------------------- page', i, '--------------------------------------------------'
                            # Link to the next page
                            nextPageReg = r'class="atl-pages">.+?</strong>.+?href="(.+?)">'  # regex for the next-page link
                            nextPageLink = getContnetByReg(nextPageReg, pageHtml.text)
                            print 'nextPageLink: ', originalUrl + nextPageLink[0].strip()
                            replynextPageList = getReplyData(originalUrl + nextPageLink[0].strip())  # next page's replies
                            nextPageHtml = getHtml(originalUrl + nextPageLink[0].strip())
                            # Save this page's replies
                            getReplyAllInfo(topicDataList, replynextPageList, authorUrl, originalUrl + result[0].strip())
                            flag = getContnetByReg(regPage, nextPageHtml.text)
                            pageHtml = nextPageHtml  # advance; without this the loop would re-fetch page 2 forever
                except Exception, e:
                    print 'failed!..', 'exception is: ', e
        n += 1
        if n == 2:
            # Page 1 of the board list has a single "next" link
            url = 'http://bbs.tianya.cn' + nextLink[0]
            html = getHtml(url)
            nextLink = getContnetByReg(regLink, html.text)
        else:
            # Later pages list a "previous" link first, so skip one more </a>
            regLink2 = r'div class="links".+?</a>.+?</a>.+?href="(.+?)"'
            nextLink = getContnetByReg(regLink2, html.text)
            url = 'http://bbs.tianya.cn' + nextLink[0]
            try:
                html = getHtml(url)
                nextLink = getContnetByReg(regLink, html.text)
            except Exception, e:
                print 'error! failed to fetch the page!'
if __name__ == '__main__':
    # url = 'http://bbs.tianya.cn/list.jsp?item=develop&order=1'
    url = 'http://bbs.tianya.cn/list.jsp?item=free&order=1'
    page = []
    s = set()
    # newpage = 'http://tieba.baidu.com/p/3522395718?pn=1'
    page.append(url)
    pool = ThreadPool(8)
    try:
        pool.map(spider, page)
    except Exception, e:
        print e
    finally:
        pool.close()
        pool.join()
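A note on the threading: pool.map only parallelizes across the entries of page, and page holds a single board URL here, so the run above is effectively single-threaded despite the 8 workers. A minimal sketch of feeding several boards to the pool (only 'free' and 'develop' appear in the original; 'funinfo' is a hypothetical extra board name):

    boards = ['free', 'develop', 'funinfo']  # 'funinfo' is a made-up example
    page = ['http://bbs.tianya.cn/list.jsp?item=%s&order=1' % b for b in boards]
    pool = ThreadPool(8)    # up to 8 boards crawled concurrently
    pool.map(spider, page)  # one spider(url) call per board URL
    pool.close()
    pool.join()

With several workers the shared set s is then mutated from multiple threads; CPython's GIL makes s.add() safe enough for this use, but guarding it with a threading.Lock would be cleaner.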
Database helpers (sqlUtil2.py):
#coding: utf-8
'''
Created on 2016-04-27
@author: Administrator
'''
def saveTopic(params, conn):
    cur = conn.cursor()
    sql = "insert into topic(topicId,website,title,content,scanNums,replyNums,postTime,userId,grabTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    # Write the row and commit
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Topic!'
    print '...................................................'
    cur.close()
def saveUser(params, conn):
    cur = conn.cursor()
    sql = "insert into user(userId,fansCount,followCount,name,writerUrl,grabTime) values(%s,%s,%s,%s,%s,%s)"
    # Write the row and commit
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== User!'
    print '...................................................'
    cur.close()
def saveRelation(params, conn):
    cur = conn.cursor()
    sql = "insert into relation(id,userFrom,userTo) values(%s,%s,%s)"
    # Write the row and commit
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Relation!'
    print '...................................................'
    cur.close()
def saveComment(params, conn):
    cur = conn.cursor()
    sql = "insert into comment(commentId,content,postTime,topicId,userId,grabTime) values(%s,%s,%s,%s,%s,%s)"
    # Write the row and commit
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Comment!'
    print '...................................................'
    cur.close()
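The post never shows the table definitions. The script below is a minimal sketch that creates tables matching the column lists in the INSERT statements above; every column type and length is an assumption, not taken from the source:

#coding: utf-8
# Sketch only: create the schema assumed by sqlUtil2.py (types/lengths are guesses).
import MySQLdb

DDL = [
    """create table if not exists topic(
         topicId varchar(32) primary key, website varchar(255), title varchar(255),
         content text, scanNums int, replyNums int, postTime varchar(32),
         userId varchar(32), grabTime datetime) default charset=utf8""",
    """create table if not exists user(
         userId varchar(32) primary key, fansCount int, followCount int,
         name varchar(64), writerUrl varchar(255), grabTime datetime)
         default charset=utf8""",
    """create table if not exists relation(
         id varchar(36) primary key, userFrom varchar(64), userTo varchar(64))
         default charset=utf8""",  # id holds a stringified uuid4 (36 chars)
    """create table if not exists comment(
         commentId varchar(36) primary key, content text, postTime varchar(32),
         topicId varchar(32), userId varchar(32), grabTime datetime)
         default charset=utf8""",
]

conn = MySQLdb.connect(host='localhost', user='root', passwd='1234',
                       db='networkpublicopinionmap3', port=3306, charset='utf8')
cur = conn.cursor()
for stmt in DDL:
    cur.execute(stmt)  # idempotent thanks to IF NOT EXISTS
conn.commit()
cur.close()
conn.close()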
Original article: http://blog.csdn.net/poice00/article/details/51894373