pyspider — crawling and downloading images

Using the 1ppt site as an example: http://www.1ppt.com/
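Both scripts below pull the image filename out of its URL with the regular expression [0-9]*\.jpg. A quick, illustrative check of that pattern on a made-up URL (the path here is hypothetical, not taken from the site):

import re

patter = r'[0-9]*\.jpg'
sample_url = 'http://www.1ppt.com/uploads/1234.jpg'   # hypothetical image URL
match = re.search(patter, sample_url)
print match.group() if match else 'no match'          # prints: 1234.jpg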

from pyspider.libs.base_handler import *
import urllib2, HTMLParser, re

# Base URL of the site
host = "http://www.1ppt.com/"
# Local directory to save downloaded images
localSavePath = '/data/girls/'
# URL of the starting image page
startHtmlUrl = ''
# URLs of image pages
htmlUrlList = []
# Image URLs
imageUrlList = []
patter = r'[0-9]*\.jpg'
# Download the image at the given URL and save it locally
def downloadImage(url):
    print url
    cont = urllib2.urlopen(url).read()
    match = re.search(patter, url)
    if match:
        print 'Downloading file:', match.group()
        filename = localSavePath + match.group()
        f = open(filename, 'wb')
        f.write(cont)
        f.close()
    else:
        print 'no match'

# Visit each gallery page found on the index page
def getImageUrlByHtmlUrl(htmlUrl):
    parser = MyHtmlParse(False)
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        content = response.read()
        parser.feed(content)
    except urllib2.URLError,e:
        print e.reason
        return

class MyHtmlParse(HTMLParser.HTMLParser):
    def __init__(self, isIndex):
        self.isIndex = isIndex
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if self.isIndex:
            if tag == 'a':
                if len(attrs) == 3:
                    if attrs[1][0] == 'title':
                        newUrl = host + attrs[0][1]
                        # print 'Found a gallery page link:', newUrl
                        global startHtmlUrl
                        startHtmlUrl = newUrl
                        getImageUrlByHtmlUrl(newUrl)
        else:
            if tag == 'img':
                if attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1]:
                    imageUrl = attrs[0][1]
                    match = re.search(patter, imageUrl)
                    if match:
                        print 'Found an image:', imageUrl
                        downloadImage(imageUrl)
                        imageUrlList.append(imageUrl)
            # Next-page handling (disabled):
            #if tag == 'a':
            #    if len(attrs) == 4:
            #        #if attrs[1] == ('class', 'next'):
            #        nextUrl = host + attrs[2][1]
            #        print 'Found a gallery page link:', nextUrl
            #        global startHtmlUrl
            #        if startHtmlUrl != nextUrl:
            #            getImageUrlByHtmlUrl(nextUrl)


# Parse the index page to get the link to each gallery
def parse_url_picture(indexUrl):
    # e.g. indexUrl = 'http://www.1ppt.com'
    m = urllib2.urlopen(indexUrl).read()
    parserIndex = MyHtmlParse(True)
    parserIndex.feed(m)

picture_website = r'http://www.1ppt.com/'
class Handler(BaseHandler):
    crawl_config = {
    }
    
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(picture_website, callback=self.index_page)
        return
    @config(age= 10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            print each.attr.href
            parse_url_picture(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page)
        return
    
    @config(priority=2)
    def detail_page(self, response):
        return {}
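In index_page, response.doc('a[href^="http"]') selects every absolute http link on the index page via a CSS selector. As a rough sketch of the same selection done directly with the pyquery package (which pyspider builds response.doc on); the HTML snippet here is made up for illustration:

from pyquery import PyQuery

html = '<a href="http://www.1ppt.com/article/1.html">a</a><a href="/local.html">b</a>'
doc = PyQuery(html)
for link in doc('a[href^="http"]').items():   # keeps only links starting with "http"
    print link.attr('href')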

 

The script below can be run directly (no need to deploy it on the pyspider platform):

#!/usr/bin/python
# coding: utf-8
#########################################################################
# File Name: girls.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Mon 09 Jun 2014 09:23:18 PM CST
#########################################################################

import urllib2, HTMLParser, re

# Base URL of the site
host = "http://1ppt.com"
# Local directory to save downloaded images
localSavePath = '/data/girls/'
# URL of the starting image page
startHtmlUrl = ''
# URLs of image pages
htmlUrlList = []
# Image URLs
imageUrlList = []
patter = r'[0-9]*\.jpg'
# Download the image at the given URL and save it locally
def downloadImage(url):
    print url
    cont = urllib2.urlopen(url).read()
    match = re.search(patter, url)
    if match:
        print 'Downloading file:', match.group()
        filename = localSavePath + match.group()
        f = open(filename, 'wb')
        f.write(cont)
        f.close()
    else:
        print 'no match'

# Visit each gallery page found on the index page
def getImageUrlByHtmlUrl(htmlUrl):
    parser = MyHtmlParse(False)
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        content = response.read()
        parser.feed(content)
    except urllib2.URLError,e:
        print e.reason

class MyHtmlParse(HTMLParser.HTMLParser):
    def __init__(self, isIndex):
        self.isIndex = isIndex
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if self.isIndex:
            if tag == 'a':
                if len(attrs) == 3:
                    if attrs[1][0] == 'title':
                        newUrl = host + attrs[0][1]
                        # print 'Found a gallery page link:', newUrl
                        global startHtmlUrl
                        startHtmlUrl = newUrl
                        getImageUrlByHtmlUrl(newUrl)
        else:
            if tag == 'img':
                print attrs[0][0]
                print attrs[1][0]
                if attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1]:
                    imageUrl = attrs[0][1]
                    match = re.search(patter, imageUrl)
                    if match:
                        print 'Found an image:', imageUrl
                        downloadImage(imageUrl)
                        imageUrlList.append(imageUrl)
            # Next-page handling (disabled):
            #if tag == 'a':
            #    if len(attrs) == 4:
            #        #if attrs[1] == ('class', 'next'):
            #        nextUrl = host + attrs[2][1]
            #        print 'Found a gallery page link:', nextUrl
            #        global startHtmlUrl
            #        if startHtmlUrl != nextUrl:
            #            getImageUrlByHtmlUrl(nextUrl)
# Parse the index page to get the link to each gallery
indexUrl = 'http://www.1ppt.com'
m = urllib2.urlopen(indexUrl).read()
parserIndex = MyHtmlParse(True)
parserIndex.feed(m)
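Both versions assume that the directory in localSavePath already exists; open() raises IOError if /data/girls/ is missing. A small optional pre-flight check, not part of the original scripts, that could be run first:

import os

save_dir = '/data/girls/'          # keep in sync with localSavePath above
if not os.path.isdir(save_dir):    # create the save directory if it is missing
    os.makedirs(save_dir)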

Original article: http://www.cnblogs.com/panliu/p/4849212.html
