码迷,mamicode.com
首页 > 其他好文 > 详细

表情包的同步异步下载

时间:2019-07-02 22:55:29      阅读:238      评论:0      收藏:0      [点我收藏+]

标签:查看   *args   main   out   like   保存图片   put   lxml   split   

同步下载

from lxml import etree
import requests
from urllib import request #保存图片
import os
import re

def parse_page(url):
    """Fetch one listing page and download every non-gif meme image on it.

    :param url: URL of a doutula.com photo-list page.
    """
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    response = requests.get(url=url, headers=headers).text
    tree = etree.HTML(response)
    # BUG FIX: the XPath expression must be a string literal.
    images = tree.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
    for img in images:
        # The <img> "src" is a lazy-loading placeholder; the real image URL
        # is stored in the "data-original" attribute.
        # (print(etree.tostring(img)) to inspect the raw HTML of an <img>.)
        img_url = img.get("data-original")

        # The "alt" text becomes the file name; strip characters that are
        # illegal or awkward in file names.
        pic_name = img.get("alt")
        pic_name = re.sub(r"[\??\.,。 !!]"," ",pic_name)
        # Keep the original file extension (.jpg / .png / ...).
        suffix = os.path.splitext(img_url)[1]
        filename = pic_name + suffix
        # BUG FIX: the destination path must be a string ("images/" + filename).
        request.urlretrieve(img_url, "images/" + filename)


def main():
    """Crawl the first two listing pages and download their meme images."""
    for x in range(1, 3):  # pages 1 and 2 (range end is exclusive)
        # BUG FIX: the URL template must be a string literal.
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        parse_page(url)


# BUG FIX: "__main__" must be a string literal.
if __name__ == "__main__":
    main()

 

表情包的异步下载

from lxml import etree
import requests
from urllib import request #保存图片
import os
import re
from queue import Queue
import threading


class Producer(threading.Thread):
    """Worker thread that parses listing pages.

    Pulls page URLs from ``page_queue``, extracts every non-gif image and
    pushes ``(image_url, filename)`` tuples onto ``img_queue`` for the
    Consumer threads to download.
    """

    # Shared across all producer instances (class attribute).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # listing-page URLs still to parse
        self.img_queue = img_queue    # (img_url, filename) pairs to download

    def run(self):
        while True:
            if self.page_queue.empty():  # no pages left to parse -> stop
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        """Parse one listing page and enqueue every non-gif image found."""
        response = requests.get(url=url, headers=self.headers).text
        tree = etree.HTML(response)
        # BUG FIX: the XPath expression must be a string literal.
        images = tree.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
        for img in images:
            # The real image URL lives in "data-original" (lazy loading),
            # not in "src".
            img_url = img.get("data-original")
            # The "alt" text becomes the file name; strip characters that
            # are illegal or awkward in file names.
            pic_name = img.get("alt")
            pic_name = re.sub(r"[\??\.,。 !!\*]"," ",pic_name)
            suffix = os.path.splitext(img_url)[1]  # keep original extension
            filename = pic_name + suffix

            # Hand off to the consumer threads as a (url, filename) pair.
            self.img_queue.put((img_url, filename))


class Consumer(threading.Thread):
    """Worker thread that saves images.

    Pulls ``(image_url, filename)`` tuples from ``img_queue`` (filled by the
    Producer threads) and saves each image under the ``images/`` directory.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # watched only to decide when to exit
        self.img_queue = img_queue    # (img_url, filename) pairs to download

    def run(self):
        from queue import Empty  # local import: only needed by this loop

        while True:
            # Exit once the producers are done and nothing is left to save.
            if self.img_queue.empty() and self.page_queue.empty():
                break
            # BUG FIX: a plain blocking get() could hang forever if another
            # consumer took the last item between the check above and here;
            # a short timeout lets the loop re-check the exit condition.
            try:
                img_url, filename = self.img_queue.get(timeout=3)
            except Empty:
                continue
            # BUG FIX: the destination path must be a string ("images/" + filename).
            request.urlretrieve(img_url, "images/" + filename)
            print(filename, " 下载完毕")

def main():
    """Queue the listing-page URLs and start producer/consumer threads."""
    # page_queue holds listing pages to parse; img_queue holds images to
    # save.  The image queue should be generously sized so producers are
    # not blocked waiting for consumers.
    page_queue = Queue(100)   # crawl up to 100 pages
    img_queue = Queue(1000)

    for x in range(1, 101):  # pages 1-100
        # BUG FIX: the URL template must be a string literal.
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        # Queue each listing-page URL for the producers.
        page_queue.put(url)

    # Five threads fetch and parse listing pages ...
    for x in range(5):
        t = Producer(page_queue, img_queue)
        t.start()

    # ... and five threads download the images they find.
    for x in range(5):
        t = Consumer(page_queue, img_queue)
        t.start()


# BUG FIX: "__main__" must be a string literal.
if __name__ == "__main__":
    main()

 

表情包的同步异步下载

标签:查看   *args   main   out   like   保存图片   put   lxml   split   

原文地址:https://www.cnblogs.com/kenD/p/11123555.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!