利用 requests、BeautifulSoup、xml 写多线程爬虫

时间：2017-06-02 17:23:48 阅读：202 评论：0 收藏：0 [点我收藏+]

标签：自定义 win request data star import beautiful pat 使用

# -*- coding:UTF-8 -*-
import requests,time
from collections import OrderedDict
import threading
from bs4 import BeautifulSoup as bp

    

# Wall-clock timestamp taken before any thread is created; the end of the
# script subtracts it from a second timestamp to report the elapsed time.
t3 = time.time()
ths = []  # holds the worker threads


def get(num, timeout=30):
    """Fetch one result page from the txl.cnnic.cn search endpoint and print it.

    Posts the search form with ``dbpage`` set to ``num``, parses the response
    body as XML and prints one line per ``EmailResult`` element.

    Args:
        num: Page number to request; sent as the ``dbpage`` form field.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block this worker thread (and therefore the
            caller's ``join()``) indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
        AttributeError: If an ``EmailResult`` element lacks one of the
            expected child tags.
    """
    form = {
        'basename': 'BASENAME11',
        'where': '2PLDYDY1',
        'dbpage': str(num),
        'pagecount': '5',
        'order': 'ORDER1,ORDER2',
        'orderbytype': 'ASC',
        'searchList': 'SEARCHLIST11',
        'isKz': '0',
        # NOTE(review): looks like a random cache-busting token copied from a
        # browser session -- confirm whether the server actually checks it.
        'id': '0.40519130290516947',
    }
    # Custom request headers, copied from what the browser sends.
    # The standard HTTP header name is spelled "Referer" (single "r").
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        ),
        'Referer': 'http://txl.cnnic.cn/cnnic/center.do?path=txl_index',
    }
    page = requests.post(
        'http://txl.cnnic.cn/cnnic/dBSearchForTxlAction.do',
        headers=headers,
        data=form,
        timeout=timeout,
    )

    # Parse the response as XML; use 'lxml' instead of 'xml' when parsing HTML.
    soup = bp(page.text, 'xml')

    # find_all returns a list of every <EmailResult> element in the document.
    for info in soup.find_all('EmailResult'):
        # Each field is read from the child tag of the same name. NAME is
        # padded with the full-width space U+3000 -- presumably so that CJK
        # names line up in a terminal; the other columns use ASCII spaces.
        print(
            info.NAME.text.ljust(10, '\u3000'),
            info.FENJI.text.ljust(20, ' '),
            info.SHOUJI.text.ljust(30),
            info.EMAIL.text.ljust(30),
            info.ZHIWU.text,
        )

# Create one worker thread per page index (0 through 74) and register it.
ths.extend(threading.Thread(target=get, args=(page,)) for page in range(75))

# Launch every worker first, then block until each of them has finished.
for worker in ths:
    worker.start()
for worker in ths:
    worker.join()

# Report the total wall-clock time, in seconds, since t3 was recorded.
print(time.time() - t3)

标签：自定义 win request data star import beautiful pat 使用

原文地址：http://www.cnblogs.com/wt11/p/6933629.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行