码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫:爬取男人团女优们的封面

时间:2018-11-16 23:42:37      阅读:5162      评论:0      收藏:0      [点我收藏+]

标签:erer   观察   content   rgs   lib   图片   href   empty   list   

将同一个女优的图片放到一个文件夹,用

threading.Lock()

防止新建文件夹错误,但注释掉后还能正常运行,有待观察

from lxml import etree
import requests
import os
import re
from urllib import request
import threading
from queue import Queue
from threading import Lock
# Site root; relative hrefs scraped from the pages are joined onto this.
# (Quotes restored: the scraped paste had turned every ' into U+2018,
# which is a Python syntax error.)
base = 'http://nanrenvip.xyz'

# Shared request headers for every HTTP call. The Referer matches the
# first listing page; the User-Agent placeholder must be replaced with a
# real browser UA string before running.
headers = {
    'User-Agent': '~~~~~~~~~~~~~~~',
    'Referer': 'http://nanrenvip.xyz/nvyouku/1-0-0-0-0-0-0.html'
}


class Producer(threading.Thread):
    """Scrape actress listing pages and queue {'name', 'url'} dicts.

    ``name`` is used later to group downloads into per-actress folders and
    is carried through every subsequent pipeline stage.
    """

    def __init__(self, pages, women_pages, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.pages = pages              # in:  Queue of listing-page URLs
        self.women_pages = women_pages  # out: Queue of per-actress dicts

    def run(self):
        while True:
            # NOTE(review): empty()-then-blocking-get() is racy with
            # several workers — another thread can drain the queue between
            # the check and the get(), hanging this thread. It works here
            # only because main() enqueues every URL before starting any
            # worker; confirm before reusing this pattern.
            if self.pages.empty():
                break
            url = self.pages.get()
            self.get_women(url)

    def get_women(self, url):
        """Extract name and detail-page URL for each actress on one page."""
        response = requests.get(url, headers=headers)
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        # Only the first 15 <li> entries are actress cards; the rest of
        # the list is other page furniture.
        box = html.xpath('//div[@class="list_box"]//div[@class="list_l"]//li')[:15]
        for each in box:
            name = each.xpath('./a/@title')[0]
            her_url = each.xpath('./a/@href')[0]
            women = {'name': name, 'url': base + her_url}
            self.women_pages.put(women)


class Producer_2(threading.Thread):
    """Fetch each actress detail page and queue one dict per title.

    Each queued dict carries the title's page URL, its title text (later
    used as the image filename) and the actress ``name`` (used as the
    folder name).
    """

    def __init__(self, pages, women_pages, avs, *args, **kwargs):
        super(Producer_2, self).__init__(*args, **kwargs)
        self.pages = pages              # upstream stage, watched for drain
        self.women_pages = women_pages  # in:  Queue of actress dicts
        self.avs = avs                  # out: Queue of per-title dicts

    def run(self):
        while True:
            # Stop only once the upstream stage AND our own input queue
            # are both drained. NOTE(review): empty()-then-blocking-get()
            # can still hang if another worker wins the race for the last
            # item — see the note on Producer.run.
            if self.pages.empty() and self.women_pages.empty():
                break
            women = self.women_pages.get()
            self.get_av_list(women)

    def get_av_list(self, women):
        """Extract every title link from one actress's detail page."""
        url = women['url']
        name = women['name']
        response = requests.get(url, headers=headers)
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        lst = html.xpath('//div[@class="zp_list"]')[0]
        # Serialise the subtree back to text so a regex can pull out
        # (href, link text) pairs in one pass.
        text = etree.tostring(lst, encoding='utf-8').decode('utf-8')
        avs = re.findall(r'<a href="(.*?)">(.*?)</a>', text, re.DOTALL)
        for each in avs:
            her_url = base + each[0]
            her_title = each[1]
            av_list = {'url': her_url, 'title': her_title, 'name': name}
            self.avs.put(av_list)


class Producer_3(threading.Thread):
    """Resolve each title page to its cover-image src and queue it.

    Output dicts carry ``src`` (absolute image URL) plus the ``name`` and
    ``title`` needed by the downloader to place the file.
    """

    def __init__(self, pages, women_pages, avs, imgs, *args, **kwargs):
        super(Producer_3, self).__init__(*args, **kwargs)
        self.pages = pages              # upstream stages, watched for drain
        self.women_pages = women_pages
        self.avs = avs                  # in:  Queue of per-title dicts
        self.imgs = imgs                # out: Queue of image-download dicts

    def run(self):
        while True:
            # Stop only once every upstream stage and our input queue are
            # drained. NOTE(review): same empty()/get() race as the other
            # stages — see the note on Producer.run.
            if self.pages.empty() and self.women_pages.empty() and self.avs.empty():
                break
            av = self.avs.get()
            self.get_imgs(av)

    def get_imgs(self, av):
        """Scrape the lazy-loaded cover image URL from one title page."""
        url = av['url']
        name = av['name']
        title = av['title']
        response = requests.get(url, headers=headers)
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        tar = html.xpath('//div[@class="artCon"]')[0]
        text = etree.tostring(tar, encoding='utf-8').decode('utf-8')
        # The site lazy-loads images: the real URL lives in the first
        # data-original attribute, not in src.
        src = re.findall(r'data-original="(.*?)"', text, re.DOTALL)[0]
        src = base + src
        img = {'name': name, 'title': title, 'src': src}
        self.imgs.put(img)


class Consumer(threading.Thread):
    """Download queued images into ./vip/<name>/<title>.jpg.

    ``lock`` serialises directory creation so concurrent workers do not
    race on creating the same new folder; ``exist_ok=True`` additionally
    makes makedirs tolerant of that race.
    """

    def __init__(self, pages, women_pages, avs, imgs, lock, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.pages = pages              # upstream stages, watched for drain
        self.women_pages = women_pages
        self.avs = avs
        self.imgs = imgs                # in: Queue of image-download dicts
        self.lock = lock                # guards folder creation

    def run(self):
        while True:
            # Stop only once every upstream stage is drained.
            # NOTE(review): same empty()/get() race as the other stages —
            # see the note on Producer.run.
            if self.pages.empty() and self.women_pages.empty() \
                    and self.avs.empty() and self.imgs.empty():
                break
            img = self.imgs.get()
            self.download(img)

    def download(self, img):
        """Fetch one image; on failure, print its URL and keep going."""
        src = img['src']
        name = img['name']
        title = img['title']
        # 'with' guarantees the lock is released even if makedirs raises
        # (the original acquire/release pair would have stayed locked).
        with self.lock:
            os.makedirs('vip/' + name, exist_ok=True)
        try:
            request.urlretrieve(src, './vip/%s/%s.jpg' % (name, title))
        except Exception:
            # Best-effort download: narrow the original bare except so
            # KeyboardInterrupt/SystemExit still propagate.
            print(src)


def main():
    """Wire up the four pipeline stages and start the worker threads.

    All 55 listing-page URLs are enqueued BEFORE any worker starts; the
    workers' empty()-then-get() termination checks rely on this ordering.
    """
    base_url = 'http://nanrenvip.xyz/nvyouku/1-0-0-0-0-0-{}.html'
    pages = Queue(60)
    women_pages = Queue(1000)
    avs = Queue(100000)
    imgs = Queue(100000)
    lock = threading.Lock()
    for x in range(55):
        url = base_url.format(x)
        pages.put(url)
    # Stage fan-out: 2 listing scrapers feed 10 detail scrapers, which
    # feed 10 image resolvers, which feed 10 downloaders.
    for _ in range(2):
        Producer(pages, women_pages).start()
    for _ in range(10):
        Producer_2(pages, women_pages, avs).start()
    for _ in range(10):
        Producer_3(pages, women_pages, avs, imgs).start()
    for _ in range(10):
        Consumer(pages, women_pages, avs, imgs, lock).start()


if __name__ == '__main__':
    main()

爬虫:爬取男人团女优们的封面

标签:erer   观察   content   rgs   lib   图片   href   empty   list   

原文地址:https://www.cnblogs.com/xxxxf/p/9972109.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!