urlController.py
import bsController
from urllib import request


class SpiderMain(object):
    def __init__(self):
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}
        self.bsManage = bsController.bsManage()

    def getUrl(self, rootUrl):
        # Item pages are numbered 1.html, 2.html, ... under rootUrl
        for i in range(1, 500):
            url = rootUrl + '%s' % i + '.html'
            req = request.Request(url)
            for h in self.header:
                req.add_header(h, self.header[h])
            try:
                resp = request.urlopen(req)
                html = resp.read()
                resp.close()  # close the response object (Request has no close())
                # print(html)
                self.bsManage.getPageUrl(html, i)
            except request.URLError as e:
                if hasattr(e, 'code'):
                    print('Error code:', e.code)
                elif hasattr(e, 'reason'):
                    print('Reason:', e.reason)


if __name__ == '__main__':
    rootUrl = 'http://www.meitulu.com/item/'
    obj_root = SpiderMain()
    obj_root.getUrl(rootUrl)
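As a side note, the add_header loop is not strictly required: urllib.request.Request accepts a headers dict directly. A minimal sketch (the URL here is just one item page from the same site used above):

from urllib import request

# Minimal sketch: pass the header dict straight to Request instead of
# calling add_header for each entry.
header = {'User-Agent': 'Mozilla/5.0'}
req = request.Request('http://www.meitulu.com/item/1.html', headers=header)
html = request.urlopen(req).read()

Both forms send the same request; the loop version simply mirrors how the headers are stored in the class.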
bsController.py
from bs4 import BeautifulSoup
from urllib import request
import os


class bsManage:
    def __init__(self):
        self.pageUrl = 'http://www.meitulu.com/item/'
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}

    # html is the raw HTML of the item's first page
    # i is the item number (its pages are named i.html, i_2.html, ...)
    def getPageUrl(self, html, i):
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        # the second-to-last link in the pager points to the last page
        lastUrl = soup.find_all('div', {'id': 'pages'})[0].find_all('a')[-2]['href']
        # print(lastUrl)
        # number of digits in the item number, needed to slice out the page number
        if i < 10:
            digits = 1
        elif i < 100:
            digits = 2
        elif i < 1000:
            digits = 3
        elif i < 10000:
            digits = 4
        # 29 = len('http://www.meitulu.com/item/') + 1 for the underscore;
        # stripping the trailing '.html' leaves the last page number
        lastPage = int(lastUrl[29 + digits:-5])
        # create the image folders
        if not os.path.exists('img'):
            os.mkdir('img')
        path = 'img/%s' % i
        if not os.path.exists(path):
            os.mkdir(path)
        # scrape the first page here because its URL format differs from the rest
        # collect the image links we need
        links = soup.find_all('img', class_='content_img')
        for link in links:
            name = str(link['src'])[-21:]
            data = request.urlopen(link['src']).read()
            img = open('img/%s/' % i + name, 'wb+')
            img.write(data)
            img.close()
        # each item has lastPage sub-pages; pages 2..lastPage use the i_j.html format
        for j in range(2, lastPage + 1):
            # rebuild the URL of the next sub-page
            url = self.pageUrl + '%s_%s' % (i, j) + '.html'
            self.saveImgWithUrl(url, i)
        print('item %d finished' % i)

    def saveImgWithUrl(self, url, i):
        req = request.Request(url)
        for h in self.header:
            req.add_header(h, self.header[h])
        try:
            html = request.urlopen(req).read()
            soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
            # collect the image links on this sub-page
            links = soup.find_all('img', class_='content_img')
            for link in links:
                name = str(link['src'])[-21:]
                data = request.urlopen(link['src']).read()
                img = open('img/%s/' % i + name, 'wb+')
                img.write(data)
                img.close()
        except request.URLError as e:
            if hasattr(e, 'code'):
                print('Error code:', e.code)
            elif hasattr(e, 'reason'):
                print('Reason:', e.reason)
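The digit-counting branch and the 29 + digits slice depend on the exact length of the item URL prefix. A sketch of a more robust alternative, assuming the pager hrefs keep the item_page.html shape (the helper name is illustrative, not part of the original code):

import re

# Sketch only: pull the last page number out of the pager href without
# hard-coded string offsets; falls back to 1 when no _<page> suffix exists.
def last_page_from_href(href):
    m = re.search(r'_(\d+)\.html$', href)
    return int(m.group(1)) if m else 1

With a helper like this, getPageUrl could call last_page_from_href(lastUrl) and drop the digit-counting if/elif chain entirely.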
Original article: http://www.cnblogs.com/xj76149095/p/5851065.html