码迷,mamicode.com
首页 > 编程语言 > 详细

python之路——爬虫实例

时间:2016-09-07 22:44:00      阅读:307      评论:0      收藏:0      [点我收藏+]

标签:

urlController.py

import bsController
from urllib import request

class SpiderMain(object):
    """Fetch numbered item pages under a root URL and hand each one to bsController.

    For every item number ``i`` the page ``<rootUrl><i>.html`` is downloaded
    and its raw bytes are passed to ``bsManage.getPageUrl(html, i)``.
    """

    def __init__(self):
        # Browser-like request headers sent with every item-page request.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive',
        }
        self.bsManage = bsController.bsManage()

    def getUrl(self, rootUrl, start=1, stop=500):
        """Download item pages ``start`` .. ``stop - 1`` and process each one.

        Args:
            rootUrl: URL prefix; the item number and ``.html`` are appended.
            start: first item number to fetch (inclusive). Defaults to 1.
            stop: item number to stop before (exclusive). Defaults to 500.

        URL errors (raised while fetching the page or propagated out of
        ``getPageUrl``) are reported on stdout and the loop moves on to the
        next item; any other exception propagates to the caller.
        """
        for i in range(start, stop):
            url = '%s%s.html' % (rootUrl, i)
            req = request.Request(url, headers=self.header)
            try:
                # The response object (not the Request) is what must be
                # closed; the context manager does that even if read() fails.
                with request.urlopen(req) as resp:
                    html = resp.read()
                self.bsManage.getPageUrl(html, i)
            except request.URLError as e:
                # HTTP errors carry a status code; plain URL errors only a reason.
                if hasattr(e, 'code'):
                    print('Error code:', e.code)
                elif hasattr(e, 'reason'):
                    print('Reason:', e.reason)


if __name__ == '__main__':
    # Script entry point: crawl the item pages found under this URL prefix.
    rootUrl = 'http://www.meitulu.com/item/'
    obj_root = SpiderMain()
    obj_root.getUrl(rootUrl)

 

bsController.py

from bs4 import BeautifulSoup
from urllib import request
import os

class bsManage:
    """Save the images of one item, walking all of its sub-pages.

    An item ``i`` has a first page ``<pageUrl>i.html`` (its HTML is supplied
    by the caller) and further sub-pages ``<pageUrl>i_j.html`` for
    ``j = 2 .. lastPage``. Images are written to ``img/<i>/``.
    """

    def __init__(self):
        # URL prefix used to build the sub-page URLs.
        self.pageUrl = 'http://www.meitulu.com/item/'
        # Browser-like request headers sent when fetching sub-pages.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive',
        }

    @staticmethod
    def _lastPageNumber(lastUrl):
        """Return the sub-page number ``x`` from a link ending in ``i_x.html``.

        Works for absolute and relative links. Returns 1 when the link does
        not end in the ``<i>_<x>.html`` form (e.g. a single-page item).
        """
        stem = lastUrl.rsplit('/', 1)[-1]
        if stem.endswith('.html'):
            stem = stem[:-len('.html')]
        _, sep, page = stem.partition('_')
        return int(page) if sep and page.isdigit() else 1

    def _saveImages(self, soup, i):
        """Download every ``img.content_img`` in ``soup`` into ``img/<i>/``.

        URL errors raised by the downloads propagate to the caller.
        """
        folder = os.path.join('img', str(i))
        os.makedirs(folder, exist_ok=True)
        for link in soup.find_all('img', class_='content_img'):
            src = str(link['src'])
            # File name is the last 21 characters of the image URL.
            # NOTE(review): presumably matches this site's fixed-length image
            # names; confirm before reusing for other sources.
            name = src[-21:]
            with request.urlopen(src) as resp:
                data = resp.read()
            with open(os.path.join(folder, name), 'wb') as img:
                img.write(data)

    def getPageUrl(self, html, i):
        """Save all images of item ``i``, starting from its first page.

        Args:
            html: raw HTML of the item's first page (``i.html``).
            i: item number; also the name of the folder under ``img/``.

        The first page is handled here because its URL has a different form
        from the sub-pages (``i.html`` vs ``i_j.html``). URL errors from
        first-page image downloads propagate to the caller; errors on
        sub-pages are reported by ``saveImgWithUrl``.
        """
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

        # The second-to-last link inside div#pages is taken as the link to
        # the last sub-page. If the pager is absent or too short, treat the
        # item as having a single page instead of raising IndexError.
        pager = soup.find('div', {'id': 'pages'})
        pageLinks = pager.find_all('a') if pager is not None else []
        if len(pageLinks) >= 2:
            lastPage = self._lastPageNumber(pageLinks[-2].get('href', ''))
        else:
            lastPage = 1

        # First page: images come from the HTML already in hand.
        self._saveImages(soup, i)

        # Remaining sub-pages 2 .. lastPage are fetched one by one.
        for j in range(2, lastPage + 1):
            url = self.pageUrl + '%s_%s' % (i, j) + '.html'
            self.saveImgWithUrl(url, i)
        print('%d 已经爬完' % i)

    def saveImgWithUrl(self, url, i):
        """Fetch the sub-page at ``url`` and save its images into ``img/<i>/``.

        URL errors (page fetch or image download) are reported on stdout and
        swallowed so that the remaining sub-pages are still attempted.
        """
        req = request.Request(url, headers=self.header)
        try:
            with request.urlopen(req) as resp:
                html = resp.read()
            soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
            self._saveImages(soup, i)
        except request.URLError as e:
            # HTTP errors carry a status code; plain URL errors only a reason.
            if hasattr(e, 'code'):
                print('Error code:', e.code)
            elif hasattr(e, 'reason'):
                print('Reason:', e.reason)

 

python之路——爬虫实例

标签:

原文地址:http://www.cnblogs.com/xj76149095/p/5851065.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!