Main crawler script:
#coding: utf-8
'''
Created on 2016-04-21
@author: Administrator
'''
import uuid
'''
Multithreaded crawler for the Tianya Zatan (天涯杂谈) board.
Crawls one month of data (April 2016).
'''
import requests,re
import json
import time
import MySQLdb
from sqlUtil2 import saveTopic,saveUser,saveRelation,saveComment
from multiprocessing.dummy import Pool as ThreadPool
s = set()  # user IDs whose fans/followees have already been crawled (shared across threads; re-initialized in __main__)
def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=1)  # 1-second request timeout
    html.encoding = 'utf-8'
    return html
def getAttentionHtml(userId, pageNo):
    # One page (28 entries) of the users this user follows, from Tianya's JSON API
    url = 'http://www.tianya.cn/api/tw'
    data = {
        'method': 'following.ice.select',
        'params.userId': userId,
        'params.pageSize': '28',
        'params.pageNo': pageNo
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, params=data, timeout=1)  # 1-second request timeout
    html.encoding = 'utf-8'
    return html
def getFansHtml(userId, pageNo):
    # One page (28 entries) of this user's fans, from Tianya's JSON API
    url = 'http://www.tianya.cn/api/tw'
    data = {
        'method': 'follower.ice.select',
        'params.userId': userId,
        'params.pageSize': '28',
        'params.pageNo': pageNo
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, params=data, timeout=1)  # 1-second request timeout
    html.encoding = 'utf-8'
    return html
def getContnetByReg(reg, text):
    return re.findall(reg, text, re.S)
def getReplyData(url):
    # Each match: (replier link, replier name, reply time, reply content)
    reg = r'class="atl-item".+?class="atl-info".+?href="(.+?)".+?>(.+?)</a>.+?<span>(.+?)</span>.+?class="bbs-content">(.+?)</div>'
    dataList = getContnetByReg(reg, getHtml(url).text)
    return dataList
def getTopicData(url):
    # Each match: (title, author link, author name, post time, scan count, reply count, content)
    reg = r'class="s_title".+?<span.+?>(.+?)</span>.+?div class="atl-info".+?href="(.+?)".+?>(.+?)</a>.+?<span>(.+?)</span>.+?<span>(.+?)</span>.+?<span>(.+?)</span>.+?class="atl-main".+?class="bbs-content clearfix">(.+?)</div>'
    dataList = getContnetByReg(reg, getHtml(url).text)
    return dataList
def getAuthorInfo(authorUrl):
    # Each match from the profile page: (followees link, followee count, fans link, fan count)
    reg = r'class="relate-link".+?href="(.+?)">(.+?)</a>.+?href="(.+?)">(.+?)</a>'
    dataList = getContnetByReg(reg, getHtml(authorUrl).text)
    return dataList
def getAttentionList(userId, num):
    html = getAttentionHtml(userId, num)  # fetch once, reuse for debug print and JSON
    print html.text
    return html.json()["data"]["user"]
def getFansList(userId, num):
    html = getFansHtml(userId, num)  # fetch once, reuse for debug print and JSON
    print html.text
    return html.json()["data"]["user"]
def printFans(userId, num, username, conn):
    # Save every fan of this user: one relation row (fan -> user) plus the fan's profile
    print '================ fans ====================='
    if num % 28 == 0:
        x = num / 28
    else:
        x = num / 28 + 1
    # Some users have huge fan lists (e.g. http://www.tianya.cn/43178991/fans) -- cap the page count
    if x >= 200:
        x = x / 10
    for i in range(1, x + 1):
        print '------ page', i, '------'
        fansList = getFansList(userId, i)
        for res in fansList:
            try:
                # Save the relation
                relationParams = (str(uuid.uuid4()), res["name"], username)  # str() so MySQLdb can bind the UUID
                saveRelation(relationParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            try:
                # Save the user
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                authorUrl = 'http://www.tianya.cn/' + str(res["id"])
                userParams = (res["id"], res["fansCount"], res["followCount"], res["name"], authorUrl, grabTime)
                saveUser(userParams, conn)
                print res["id"], res["name"], res["followCount"], res["fansCount"]
            except Exception, e:
                print 'failed!..', 'exception is: ', e
def printAttention(userId, num, username, conn):
    # Save every user this user follows: one relation row (user -> followee) plus the followee's profile
    print '================ followees ====================='
    if num % 28 == 0:
        x = num / 28
    else:
        x = num / 28 + 1
    print x
    for i in range(1, x + 1):
        print '------ page', i, '------'
        attentList = getAttentionList(userId, i)
        for res in attentList:
            try:
                relationParams = (str(uuid.uuid4()), username, res["name"])  # str() so MySQLdb can bind the UUID
                saveRelation(relationParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            try:
                # Save the user
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                authorUrl = 'http://www.tianya.cn/' + str(res["id"])
                userParams = (res["id"], res["fansCount"], res["followCount"], res["name"], authorUrl, grabTime)
                saveUser(userParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            print res["id"], res["name"], res["followCount"], res["fansCount"]
def getTopicAllInfo(topicDataList, replyDataList, authorUrl, topiclink):
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='networkpublicopinionmap3', port=3306, charset='utf8')
    for topic in topicDataList:
        # Post time: keep everything after the first colon (maxsplit=1 so "10:00:00" is not truncated)
        postTime = topic[3].strip().split(':', 1)[1]
        print '******', s
        print 'topiclink: ', topiclink
        print 'topicId: ', topiclink.split('-')[-2]
        print 'title: ', topic[0].strip()
        print 'authorLink: ', topic[1].strip()
        print 'authorName: ', topic[2].strip()
        print 'postTime: ', postTime
        print 'scanNum: ', topic[4].strip().split(':')[1]
        print 'replyNum: ', topic[5].strip().split(':')[1]
        print 'content: ', topic[6].strip()
        userId = topic[1].strip().split('/')[-1]
        infoList = getAuthorInfo(topic[1].strip())  # author profile info (fans, followees, ...)
        for info in infoList:
            print '\tattentionNums: ', int(info[1].strip())
            print '\tfansNum: ', int(info[3].strip())
            try:
                # Save the author
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                userparams = (userId, info[3].strip(), info[1].strip(), topic[2].strip(), topic[1].strip(), grabTime)
                saveUser(userparams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if userId not in s:
                s.add(userId)
                if int(info[1].strip()) != 0:
                    # Save the author's followees and the follow relations
                    printAttention(userId, int(info[1].strip()), topic[2].strip(), conn)
                if int(info[3].strip()) != 0:
                    # Save the author's fans and the fan relations
                    printFans(userId, int(info[3].strip()), topic[2].strip(), conn)
        try:
            # Save the topic
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            params = (topiclink.split('-')[-2], topiclink, topic[0].strip(), topic[6].strip(), topic[4].strip().split(':')[1], topic[5].strip().split(':')[1], topic[3].strip().split(':', 1)[1], userId, grabTime)
            saveTopic(params, conn)
        except Exception, e:
            print 'saveTopic-failed!..', 'exception is: ', e
    for data in replyDataList:
        print 'replyerLink: ', data[0].strip()
        print 'replyerName: ', data[1].strip()
        print 'dateTime: ', data[2].strip().split(':', 1)[1]
        print 'content: ', data[3].strip()
        replyerId = data[0].strip().split('/')[-1]
        infoList = getAuthorInfo(data[0].strip())  # replier profile info (fans, followees, ...)
        for info in infoList:
            print '\tattentionNums: ', info[1].strip()
            print '\tfansNum: ', info[3].strip()
            try:
                # Save the replier
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                replyerParams = (replyerId, info[3].strip(), info[1].strip(), data[1].strip(), data[0].strip(), grabTime)
                saveUser(replyerParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if replyerId not in s:
                s.add(replyerId)
                if int(info[1].strip()) != 0:
                    printAttention(replyerId, int(info[1].strip()), data[1].strip(), conn)
                if int(info[3].strip()) != 0:
                    printFans(replyerId, int(info[3].strip()), data[1].strip(), conn)
        try:
            # Save the comment
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            commentParams = (str(uuid.uuid4()), data[3].strip(), data[2].strip().split(':', 1)[1], topiclink.split('-')[-2], replyerId, grabTime)
            saveComment(commentParams, conn)
        except Exception, e:
            print 'failed!..', 'exception is: ', e
    conn.close()
def getReplyAllInfo(topicDataList, replyDataList, authorUrl, topiclink):
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='networkpublicopinionmap3', port=3306, charset='utf8')
    print '............ comments from page 2 onward ............'
    for data in replyDataList:
        print 'topiclink: ', topiclink
        print 'replyerLink: ', data[0].strip()
        print 'replyername: ', data[1].strip()
        print 'dateTime: ', data[2].strip().split(':', 1)[1]
        print 'content: ', data[3].strip()
        replyerId = data[0].strip().split('/')[-1]
        infoList = getAuthorInfo(data[0].strip())  # replier profile info (fans, followees, ...)
        for info in infoList:
            print '\tattentionNums: ', info[1].strip()
            print '\tfansNum: ', info[3].strip()
            try:
                # Save the replier
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                replyerParams = (replyerId, info[3].strip(), info[1].strip(), data[1].strip(), data[0].strip(), grabTime)
                saveUser(replyerParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if replyerId not in s:
                s.add(replyerId)
                if int(info[1].strip()) != 0:
                    printAttention(replyerId, int(info[1].strip()), data[1].strip(), conn)
                if int(info[3].strip()) != 0:
                    printFans(replyerId, int(info[3].strip()), data[1].strip(), conn)
        try:
            # Save the comment
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            comment2Params = (str(uuid.uuid4()), data[3].strip(), data[2].strip().split(':', 1)[1], topiclink.split('-')[-2], replyerId, grabTime)
            saveComment(comment2Params, conn)
        except Exception, e:
            print 'saveComment()-failed!..', 'exception is: ', e
    conn.close()
def spider(url):
    originalUrl = 'http://bbs.tianya.cn'
    authorUrl = 'http://www.tianya.cn'
    reg = r'</tbody>(.+?)</table>'
    regLink = r'div class="links".+?</a>.+?href="(.+?)"'
    html = getHtml(url)
    nextLink = getContnetByReg(regLink, html.text)
    print 'nextLink: ', originalUrl + nextLink[0]
    n = 1
    while nextLink[0]:
        print '............... page', n, '..................'
        contentList = getContnetByReg(reg, html.text)
        for content in contentList:
            resreg = r'class="td-title faceblue">.+?href="(.+?)".+?(.+?)</'
            resultList = getContnetByReg(resreg, content)
            for result in resultList:
                # Check the post time first
                try:
                    pageHtml = getHtml(originalUrl + result[0].strip())
                    postTimeReg = r'class="s_title".+?div class="atl-info".+?</a>.+?<span>(.+?)</span>'  # regex for the post time
                    postTimeList = getContnetByReg(postTimeReg, pageHtml.text)
                    postTime = postTimeList[0].strip().split(':', 1)[1]
                    print 'postTime: ', postTime
                    if postTime.startswith('2016-03'):
                        print 'end..'
                        return
                    if not postTime.startswith('2016-04'):
                        print 'continue...'
                        continue
                    print 'start..'
                    # Fetch the topic and its first page of replies
                    replyDataList = getReplyData(originalUrl + result[0].strip())
                    topicDataList = getTopicData(originalUrl + result[0].strip())
                    print '================================================='
                    # First check whether the topic has a pager at all
                    isPageReg = r'class="atl-head".+?<div>(.+?)</div>'  # regex for the pager block
                    isPage = getContnetByReg(isPageReg, pageHtml.text)
                    print 'isPage[0]: ', isPage[0].strip()
                    # No pager: the first page holds everything
                    if isPage[0].strip() == '':
                        getTopicAllInfo(topicDataList, replyDataList, authorUrl, originalUrl + result[0].strip())
                    # Otherwise walk all the pages
                    else:
                        regPage = r'class="atl-pages">.+?</strong>.+?<(.+?)>'  # tells whether the current page is the last one
                        flag = getContnetByReg(regPage, pageHtml.text)
                        print 'flag: ', flag[0]
                        # Page 1
                        print '---------------------------------- page 1 ----------------------------------------------'
                        getTopicAllInfo(topicDataList, replyDataList, authorUrl, originalUrl + result[0].strip())
                        # While the current page still has a next page
                        i = 1
                        while flag[0] != 'span':
                            i += 1
                            print '---------------------------------------- page', i, '--------------------------------------------------'
                            # Link to the next page
                            nextPageReg = r'class="atl-pages">.+?</strong>.+?href="(.+?)">'  # regex for the next-page link
                            nextPageLink = getContnetByReg(nextPageReg, pageHtml.text)
                            print 'nextPageLink: ', originalUrl + nextPageLink[0].strip()
                            replynextPageList = getReplyData(originalUrl + nextPageLink[0].strip())  # next page's replies
                            nextPageHtml = getHtml(originalUrl + nextPageLink[0].strip())
                            # Save this page's replies
                            getReplyAllInfo(topicDataList, replynextPageList, authorUrl, originalUrl + result[0].strip())
                            flag = getContnetByReg(regPage, nextPageHtml.text)
                            pageHtml = nextPageHtml  # advance; without this the loop would re-fetch page 2 forever
                except Exception, e:
                    print 'failed!..', 'exception is: ', e
        n += 1
        if n == 2:
            # Page 1 of the board list has a single "next" link
            url = 'http://bbs.tianya.cn' + nextLink[0]
            html = getHtml(url)
            nextLink = getContnetByReg(regLink, html.text)
        else:
            # Later pages list a "previous" link first, so skip one more </a>
            regLink2 = r'div class="links".+?</a>.+?</a>.+?href="(.+?)"'
            nextLink = getContnetByReg(regLink2, html.text)
            url = 'http://bbs.tianya.cn' + nextLink[0]
            try:
                html = getHtml(url)
                nextLink = getContnetByReg(regLink, html.text)
            except Exception, e:
                print 'error! failed to fetch the page!'
if __name__ == '__main__':
    # url = 'http://bbs.tianya.cn/list.jsp?item=develop&order=1'
    url = 'http://bbs.tianya.cn/list.jsp?item=free&order=1'
    page = []
    s = set()
    # newpage = 'http://tieba.baidu.com/p/3522395718?pn=1'
    page.append(url)
    pool = ThreadPool(8)
    try:
        pool.map(spider, page)
    except Exception, e:
        print e
    finally:
        pool.close()
        pool.join()
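A note on the threading: pool.map only parallelizes across the entries of page, and page holds a single board URL here, so the run above is effectively single-threaded despite the 8 workers. A minimal sketch of feeding several boards to the pool (only 'free' and 'develop' appear in the original; 'funinfo' is a hypothetical extra board name):

    boards = ['free', 'develop', 'funinfo']  # 'funinfo' is a made-up example
    page = ['http://bbs.tianya.cn/list.jsp?item=%s&order=1' % b for b in boards]
    pool = ThreadPool(8)    # up to 8 boards crawled concurrently
    pool.map(spider, page)  # one spider(url) call per board URL
    pool.close()
    pool.join()

With several workers the shared set s is then mutated from multiple threads; CPython's GIL makes s.add() safe enough for this use, but guarding it with a threading.Lock would be cleaner.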
Database helpers (sqlUtil2.py):
#coding: utf-8
'''
Created on 2016-04-27
@author: Administrator
'''
def saveTopic(params, conn):
    cur = conn.cursor()
    sql = "insert into topic(topicId,website,title,content,scanNums,replyNums,postTime,userId,grabTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    # Write the row and commit
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Topic!'
    print '...................................................'
    cur.close()
def saveUser(params, conn):
    cur = conn.cursor()
    sql = "insert into user(userId,fansCount,followCount,name,writerUrl,grabTime) values(%s,%s,%s,%s,%s,%s)"
    # Write the row and commit
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== User!'
    print '...................................................'
    cur.close()
def saveRelation(params, conn):
    cur = conn.cursor()
    sql = "insert into relation(id,userFrom,userTo) values(%s,%s,%s)"
    # Write the row and commit
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Relation!'
    print '...................................................'
    cur.close()
def saveComment(params, conn):
    cur = conn.cursor()
    sql = "insert into comment(commentId,content,postTime,topicId,userId,grabTime) values(%s,%s,%s,%s,%s,%s)"
    # Write the row and commit
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Comment!'
    print '...................................................'
    cur.close()
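The post never shows the table definitions. The script below is a minimal sketch that creates tables matching the column lists in the INSERT statements above; every column type and length is an assumption, not taken from the source:

#coding: utf-8
# Sketch only: create the schema assumed by sqlUtil2.py (types/lengths are guesses).
import MySQLdb

DDL = [
    """create table if not exists topic(
         topicId varchar(32) primary key, website varchar(255), title varchar(255),
         content text, scanNums int, replyNums int, postTime varchar(32),
         userId varchar(32), grabTime datetime) default charset=utf8""",
    """create table if not exists user(
         userId varchar(32) primary key, fansCount int, followCount int,
         name varchar(64), writerUrl varchar(255), grabTime datetime)
         default charset=utf8""",
    """create table if not exists relation(
         id varchar(36) primary key, userFrom varchar(64), userTo varchar(64))
         default charset=utf8""",  # id holds a stringified uuid4 (36 chars)
    """create table if not exists comment(
         commentId varchar(36) primary key, content text, postTime varchar(32),
         topicId varchar(32), userId varchar(32), grabTime datetime)
         default charset=utf8""",
]

conn = MySQLdb.connect(host='localhost', user='root', passwd='1234',
                       db='networkpublicopinionmap3', port=3306, charset='utf8')
cur = conn.cursor()
for stmt in DDL:
    cur.execute(stmt)  # idempotent thanks to IF NOT EXISTS
conn.commit()
cur.close()
conn.close()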
Original article: http://blog.csdn.net/poice00/article/details/51894373