码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫--百度贴吧每一页中的图片

时间:2018-08-02 01:48:24      阅读:138      评论:0      收藏:0      [点我收藏+]

标签:mozilla   百度贴吧   trie   ide   art   img   nbsp   spider   rtp   

import urllib.request
import urllib.parse
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from lxml import etree

def loadPage(url):
    """Fetch one thread-list page and pass every thread on it to loadImage.

    Args:
        url: Full URL of a Tieba thread-list page (already carrying the
            ``kw`` and ``pn`` query parameters).

    The extracted thread hrefs and their count are printed, then
    ``loadImage`` is called once per thread in document order.
    """
    # NOTE(review): this User-Agent looks truncated (unbalanced parenthesis).
    # It is kept byte-for-byte because the markup served may depend on it.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(request) as response:
        html = response.read()

    # Parse the raw HTML into an element tree that can be queried with XPath.
    content = etree.HTML(html)
    # Collect the href of every thread title link on the page.
    link_list = content.xpath('//li[@class=" j_thread_list clearfix"]//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
    print(link_list, len(link_list))

    for link in link_list:
        # urljoin gives the same result as plain concatenation for the usual
        # "/p/<id>" hrefs, and also copes with hrefs that are already absolute
        # or that lack the leading slash.
        fulllink = urllib.parse.urljoin("http://tieba.baidu.com", link)
        loadImage(fulllink)

def loadImage(link):
    """Download every image posted in a single thread.

    Args:
        link: Full URL of the thread page.

    Each ``<img class="BDE_Image">`` source found on the page is saved under
    ``./tieba/``, named after the last 15 characters of its URL, and a
    confirmation line is printed per file.
    """
    import os  # local import: the file's top-level imports do not include os

    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}
    request = urllib.request.Request(link, headers=headers)
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(request) as response:
        html = response.read()
    content = etree.HTML(html)

    # Source URLs of all images embedded in the thread's posts.
    image_urls = content.xpath('//img[@class="BDE_Image"]/@src')

    # urlretrieve does not create missing directories, so make sure the
    # download target exists before the first file is written.
    os.makedirs("./tieba", exist_ok=True)

    for image_url in image_urls:
        # A "/" inside the 15-character tail would point into a non-existent
        # subdirectory; flatten it so the file always lands in ./tieba/.
        filename = image_url[-15:].replace("/", "_")
        urllib.request.urlretrieve(image_url, './tieba/' + filename)
        print("下载成功" + '----' + filename)


def tiebaSpider(url, beginPage, endPage):
    """Crawl a range of thread-list pages, from beginPage to endPage inclusive.

    Args:
        url: Base list URL that already contains the ``kw`` query parameter.
        beginPage: First page number to crawl (1-based).
        endPage: Last page number to crawl, inclusive.
    """
    # Page N is addressed by the offset pn = (N - 1) * 50, so step through
    # the offsets directly instead of the page numbers.
    for pn in range((beginPage - 1) * 50, endPage * 50, 50):
        loadPage(url + "&pn=" + str(pn))


if __name__ == "__main__":
    # Ask for the forum name and the inclusive page range to crawl.
    kw = input("请输入要爬取的贴吧名:")
    startPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    # urlencode percent-escapes the forum name, producing a query string of
    # the form kw=<escaped name> that is appended to the list endpoint.
    fullurl = "https://tieba.baidu.com/f?" + urllib.parse.urlencode({"kw": kw})

    tiebaSpider(fullurl, startPage, endPage)

 

爬虫--百度贴吧每一页中的图片

标签:mozilla   百度贴吧   trie   ide   art   img   nbsp   spider   rtp   

原文地址:https://www.cnblogs.com/dongpei/p/9404640.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!