码迷,mamicode.com
首页 > 其他好文 > 详细

xml--xpath--from lxml import etree

时间:2017-10-22 21:19:17      阅读:251      评论:0      收藏:0      [点我收藏+]

标签:res   exe   webkit   div   ber   etc   turn   美女   urlopen   

 html---显示数据
 xml ---传输数据


lxml 库
--- etree --xpath ==== XPath Helper 工具辅助解析
from lxml import etree  # 版本问题:有时候 import lxml 后不能使用 lxml.etree
xml = etree.HTML(html)  # 转换为HTML DOM
link_list = xml.xpath('//div[@class="threadlist_lz clearfix"]//a[@class="j_th_tit "]/@href')  # 匹配结果列表

 

=======================================================================================================
#  1  取出每个帖子的链接

# class="threadlist_title pull_left j_th_tit  member_thread_title_frs "  会员的class
#class="threadlist_title pull_left j_th_tit "                           普通的class

#class="threadlist_detail clearfix"  #  -===== 再往上找 找到相同的为止


# //div[@class="threadlist_lz clearfix"]//a[@class="j_th_tit "]/@href



# // --所有

# //div[@class="threadlist_lz clearfix"]//a[@class="j_th_tit"]  #  // 两层关系


#  2 取出每个帖子中 图片的链接

# //img[@class='BDE_Image']/@src


#=======================================================================================================

import urllib.request
import random
import os
import time
from concurrent.futures import ThreadPoolExecutor
from lxml import etree

def getUrl(url, user_agent):
    """
    Fetch one thread-list page and return the thread links found on it.

    :param url: URL of the list page to request
    :param user_agent: value sent in the ``User-Agent`` request header
    :return: list of ``href`` attribute values matched by the thread-title
        XPath expression (empty if nothing matches)
    """
    print(url)
    request = urllib.request.Request(url)
    request.add_header("User-Agent", user_agent)

    # Read the whole body inside ``with`` so the HTTP response is closed
    # even if decoding fails.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    my_xml = etree.HTML(html)

    # ``@class=...`` is an exact string comparison, so the trailing space in
    # "j_th_tit " is significant and is kept exactly as in the original.
    url_list = my_xml.xpath(
        '//div[@class="threadlist_lz clearfix"]//a[@class="j_th_tit "]/@href'
    )
    return url_list

#https://tieba.baidu.com/f?kw=美女&pn=50
def loadPicUrl(obj):
    """
    Done-callback: fan out one ``getPic`` task per thread link.

    :param obj: a completed future whose ``result()`` is the list of
        thread hrefs produced by ``getUrl``
    :return: None
    """
    url_list = obj.result()

    # Leaving the ``with`` block already calls ``shutdown(wait=True)``, so
    # no explicit shutdown is needed afterwards.
    with ThreadPoolExecutor() as e:
        for url in url_list:
            # hrefs are site-relative, so prefix the host before fetching.
            future = e.submit(getPic, "https://tieba.baidu.com" + url)
            future.add_done_callback(savePic)

def getPic(url):
    """
    Fetch one thread page and return the image URLs found in it.

    :param url: absolute URL of the thread page
    :return: list of ``src`` values of ``<img class="BDE_Image">`` elements
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    xml = etree.HTML(html)
    # XPath string literals need straight quotes; the curly quotes in the
    # original expression are not valid delimiters.
    pic_url_list = xml.xpath("//img[@class='BDE_Image']/@src")
    return pic_url_list

def savePic(obj):
    """
    Done-callback: download every image URL and save it locally.

    Each image is written into the current working directory under the
    last path segment of its URL.

    :param obj: a completed future whose ``result()`` is the list of image
        URLs produced by ``getPic``
    :return: None
    """
    pic_url_list = obj.result()
    for pic_url in pic_url_list:
        # os.path.join uses the platform's separator instead of a
        # hard-coded backslash.
        path = os.path.join(os.getcwd(), pic_url.split("/")[-1])
        with urllib.request.urlopen(pic_url) as response:
            content = response.read()
        with open(path, "wb") as f:
            f.write(content)

def tiebaSpider(kw, beginPage, endPage):
    """
    Crawl a range of list pages of one forum, one task per page.

    Reads the module-level globals ``url`` (base list URL) and ``ua_list``
    (candidate User-Agent values). Each page is fetched by ``getUrl`` and
    its result is handed to ``loadPicUrl`` as a done-callback.

    :param kw: forum name; it is percent-encoded before being placed in the
        query string
    :param beginPage: first page number (inclusive)
    :param endPage: last page number (inclusive)
    :return: None
    """
    # Local import: the documented home of quote() is urllib.parse, and the
    # file only imports urllib.request at the top level.
    from urllib.parse import quote

    new_kw = quote(kw)
    # Leaving the ``with`` block waits for all submitted tasks to finish
    # (equivalent to shutdown(wait=True)).
    with ThreadPoolExecutor() as executor:
        for page in range(beginPage, endPage + 1):
            # NOTE(review): pn is page * 50, so page 1 requests pn=50. If
            # page numbers are meant to be 1-based with pn=0 as the first
            # page, this should be (page - 1) * 50 -- confirm.
            new_url = url + "?kw=" + new_kw + "&pn=" + str(page * 50)
            user_agent = random.choice(ua_list)
            future = executor.submit(getUrl, new_url, user_agent)
            future.add_done_callback(loadPicUrl)


if __name__ == "__main__":

    # Base URL of the forum list page; read as a global by tiebaSpider.
    url = "http://tieba.baidu.com/f"

    # Candidate User-Agent header values; tiebaSpider picks one at random
    # per page. The header name is supplied by add_header() in getUrl, so
    # the values must not repeat a "User-Agent:" prefix.
    ua_list = [
        "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11",
        "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
        "Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
        "Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0",
        "Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)",
        "Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)",
        "Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1",
        "Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11",
        "Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11",
    ]
    kw = input("请输入需要爬取的贴吧:")
    beginPage = int(input("请输入开始页码:"))
    endPage = int(input("请输入结束的页码:"))

    # Time the whole crawl; tiebaSpider returns only after every submitted
    # page task has finished.
    start_time = time.time()
    tiebaSpider(kw, beginPage, endPage)
    end_time = time.time()

    print(end_time - start_time)   # elapsed seconds, e.g. 2.0721185207366943

 

xml--xpath--from lxml import etree

标签:res   exe   webkit   div   ber   etc   turn   美女   urlopen   

原文地址:http://www.cnblogs.com/big-handsome-guy/p/7710431.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!