【Python】理想论坛爬虫，先爬主贴，再爬主贴中子贴，将发帖人，发帖时间等录入MySql数据库

时间：2018-04-04 16:11:57 阅读：237 评论：0 收藏：0 [点我收藏+]

标签：thread 技术分享 def ddt pre strong split utf-8 5.5

这回由两个爬虫协同作业：论坛爬虫先爬主贴，爬到主贴后再启动帖子爬虫去爬该主贴下的子贴。

代码稍微多了一点：

# 论坛爬虫，用于爬取主贴再爬子贴
from bs4 import BeautifulSoup
import requests
import threading
import re
import pymysql

# Request headers passed to every requests.get() call in this file.
# The string delimiters are plain ASCII quotes; typographic quotes are a
# syntax error in Python.
# NOTE(review): "MEIE" looks like a typo for "MSIE". It is left unchanged
# because fixing it alters what is sent to the server -- confirm first.
user_agent='Mozilla/4.0 (compatible;MEIE 5.5;windows NT)'
headers={'User-Agent':user_agent}

# 论坛爬虫类（多线程）
class forumCrawler(threading.Thread):
    """Thread that crawls one forum listing page.

    For every link found inside a ``<span class="forumdisplay">`` element,
    a ``topicCrawler`` thread is created and started for that topic.

    Attributes:
        name: Thread name, also used as a prefix in log output.
        url: Address of the forum listing page to fetch.
        infos: List initialised empty; this class never fills it.
    """

    # Seconds to wait for the server; without a timeout requests.get()
    # can block the thread forever.
    REQUEST_TIMEOUT=30

    def __init__(self,name,url):
        """Store the thread name and the listing-page URL."""
        threading.Thread.__init__(self,name=name)
        self.name=name
        self.url=url
        self.infos=[]

    def run(self):
        """Fetch the listing page and start one topicCrawler per topic link.

        Any exception is printed and ends this thread; it is never
        re-raised, so one bad page does not affect the other crawlers.
        """
        print("线程"+self.name+"开始爬取页面"+self.url)

        try:
            rsp=requests.get(self.url,headers=headers,timeout=self.REQUEST_TIMEOUT)
            # rsp.text is already decoded, so BeautifulSoup would ignore a
            # from_encoding argument (and warn about it); none is passed.
            soup=BeautifulSoup(rsp.text,'html.parser')

            for span in soup.find_all('span',class_="forumdisplay"):
                for link in span.find_all('a'):
                    href=link.get("href")
                    if not href:
                        continue  # anchor without a target, nothing to crawl

                    topicLink="http://www.55188.com/"+href
                    tc=topicCrawler(name=self.name+'_tc#'+href,url=topicLink)
                    tc.start()

        except Exception as e:
            # Thread boundary: report the failure and let the thread end.
            print("线程"+self.name+"发生异常。")
            print(e)

# 帖子爬虫类（多线程）
class topicCrawler(threading.Thread):
    """Thread that crawls every page of one topic and stores the posts.

    Starting at ``url`` it collects floor, author, date and time from each
    ``<div class="postinfo">`` element, follows the link that comes right
    after the current-page marker in ``<div class="pages">``, and finally
    prints the collected rows and passes them to ``insertDB``.

    Attributes:
        name: Thread name, also stored with every row as the crawler name.
        url: Page to fetch next; the string "none" means no page is left.
        infos: Collected posts, one dict per post.
    """

    # Seconds to wait for the server; without a timeout requests.get()
    # can block the thread forever.
    REQUEST_TIMEOUT=30
    # Consecutive failures tolerated on one page before giving up, so a
    # dead server cannot keep the thread spinning forever.
    MAX_RETRIES=5
    # Compiled once instead of once per post.
    _WHITESPACE=re.compile(r'(\s+)')

    def __init__(self,name,url):
        """Store the thread name and the URL of the topic's first page."""
        threading.Thread.__init__(self,name=name)
        self.name=name
        self.url=url
        self.infos=[]

    @classmethod
    def _parse_post(cls,text):
        """Turn the text of one postinfo block into a post dict, or None.

        Runs of whitespace are collapsed to one space and the text is split
        on spaces. Only 7-field and 8-field layouts are recognised; in the
        8-field layout date and time sit one position further right.
        """
        arr=cls._WHITESPACE.sub(" ",text).split(' ')

        if len(arr)==7:
            dateIndex=4
        elif len(arr)==8:
            dateIndex=5
        else:
            return None

        return {'楼层':arr[1],
                '作者':arr[2].replace('只看：',''),
                '日期':arr[dateIndex],
                '时间':arr[dateIndex+1]}

    def _find_next_url(self,soup):
        """Return the URL of the next page, or "none" when there is none.

        The current page is marked by a <strong> element; the next page is
        the element right after it, provided that element has an href.
        """
        nextUrl="none"

        for pagesDiv in soup.find_all('div',class_="pages"):
            for strong in pagesDiv.find_all('strong'):
                print('当前为第'+strong.text+'页')

                # The direct sibling may be a whitespace-only text node,
                # which has no .get(); skip those to reach the element.
                nextNode=strong.next_sibling
                while isinstance(nextNode,str) and not nextNode.strip():
                    nextNode=nextNode.next_sibling

                if nextNode is None or isinstance(nextNode,str):
                    continue  # nothing, or plain text, follows the marker

                href=nextNode.get("href")
                if href:
                    nextUrl='http://www.55188.com/'+href

        return nextUrl

    def run(self):
        """Crawl all pages of the topic, then print and store the posts.

        A page that fails is retried up to MAX_RETRIES times in a row.
        Posts of a page are only recorded once the whole page has parsed,
        so a retry never duplicates rows. Posts collected before giving up
        are still printed and stored.
        """
        failures=0

        while self.url!="none":
            print("线程"+self.name+"开始爬取页面"+self.url)

            try:
                rsp=requests.get(self.url,headers=headers,timeout=self.REQUEST_TIMEOUT)
                # rsp.text is already decoded, so BeautifulSoup would ignore
                # a from_encoding argument (and warn); none is passed.
                soup=BeautifulSoup(rsp.text,'html.parser')

                pageInfos=[]
                for div in soup.find_all('div',class_="postinfo"):
                    info=self._parse_post(div.text)
                    if info is not None:
                        pageInfos.append(info)

                nextUrl=self._find_next_url(soup)
            except Exception as e:
                failures+=1
                print("线程"+self.name+"发生异常。重新爬行")
                print(e)
                if failures>=self.MAX_RETRIES:
                    print("线程"+self.name+"连续失败次数过多，放弃页面"+self.url)
                    break
                continue

            # Page handled completely: commit its posts and move on.
            failures=0
            self.infos.extend(pageInfos)
            self.url=nextUrl

            if self.url!="none":
                print("有下一页，线程"+self.name+"前往下一页")

        if self.url=="none":
            print("无下一页，线程"+self.name+'爬取结束，开始打印...')

        for info in self.infos:
            print('\n')
            for key in info:
                print(key+":"+info[key])

        print("线程"+self.name+'打印结束.')

        try:
            insertDB(self.name,self.infos)
        except Exception as e:
            # Thread boundary: report the failure and let the thread end.
            print("线程"+self.name+"写入数据库时发生异常。")
            print(e)

# 数据库插值
def insertDB(crawlName,infos):
    """Insert the crawled posts into the MySQL table ``test.topic``.

    Args:
        crawlName: Name of the crawler thread, stored in ``crawlername``.
        infos: Iterable of dicts with the keys '楼层', '作者', '日期'
            and '时间' (floor, author, date, time).

    Raises:
        pymysql.MySQLError: If connecting or any insert fails. Nothing is
            committed in that case and the connection is still closed.
    """
    # Values are passed as parameters instead of being concatenated into
    # the statement: author names come from the web and may contain quotes.
    sql=("insert into test.topic(floor,author,tdate,ttime,crawlername,addtime) "
         "values (%s,%s,%s,%s,%s,now())")

    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    conn=pymysql.connect(host='127.0.0.1',user='root',passwd='12345678',db='test',charset='utf8')

    try:
        with conn.cursor() as cursor:
            for info in infos:
                row=(info['楼层'],info['作者'],info['日期'],info['时间'],crawlName)
                # Print the statement as it will be sent, for debugging.
                print(cursor.mogrify(sql,row))
                cursor.execute(sql,row)

        conn.commit()  # writes are only persisted after commit
    finally:
        conn.close()  # release the connection even when an insert fails


# 入口函数
def main(first_page=1,last_page=9):
    """Start one forumCrawler thread per forum listing page.

    Args:
        first_page: Number of the first listing page to crawl.
        last_page: Number of the last listing page to crawl (inclusive).
            The defaults cover pages 1 through 9.
    """
    for i in range(first_page,last_page+1):
        url='http://www.55188.com/forum-8-'+str(i)+'.html'
        fc=forumCrawler(name='fc#'+str(i),url=url)
        fc.start()

# Start crawling only when run as a script, not when imported.
if __name__=="__main__":
    main()

输出太多就不贴了，把插入数据后的数据库展示一下：

技术分享图片

一会针对数据做个处理。

【Python】理想论坛爬虫，先爬主贴，再爬主贴中子贴，将发帖人，发帖时间等录入MySql数据库

标签：thread 技术分享 def ddt pre strong split utf-8 5.5

原文地址：https://www.cnblogs.com/xiandedanteng/p/8717506.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行