Using the 1PPT site as the example: http://www.1ppt.com/ . The first script below is the version that runs on the pyspider platform:
from pyspider.libs.base_handler import *
import urllib2, HTMLParser, re

# root url
host = "http://www.1ppt.com/"
# local save directory
localSavePath = '/data/girls/'
# starting image html address
startHtmlUrl = ''
# html addresses of the image pages
htmlUrlList = []
# image urls
imageUrlList = []
patter = '[0-9]*\.jpg'

# download the image at the given url and save it locally
def downloadImage(url):
    print url
    cont = urllib2.urlopen(url).read()
    match = re.search(patter, url)
    if match:
        print 'downloading file:', match.group()
        filename = localSavePath + match.group()
        f = open(filename, 'w+')
        f.write(cont)
        f.close()
    else:
        print 'no match'

# fetch one album page found on the front page and parse it for images
def getImageUrlByHtmlUrl(htmlUrl):
    parser = MyHtmlParse(False)
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        content = response.read()
        parser.feed(content)
    except urllib2.URLError, e:
        print e.reason
    return

class MyHtmlParse(HTMLParser.HTMLParser):
    def __init__(self, isIndex):
        self.isIndex = isIndex
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if self.isIndex:
            # on the front page, follow <a> tags that carry a title attribute
            if tag == 'a':
                if len(attrs) == 3:
                    if attrs[1][0] == 'title':
                        newUrl = host + attrs[0][1]
                        # print 'found an album page link:', newUrl
                        global startHtmlUrl
                        startHtmlUrl = newUrl
                        getImageUrlByHtmlUrl(newUrl)
        else:
            # on an album page, download every <img> whose src matches the pattern
            if tag == 'img':
                if attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1]:
                    imageUrl = attrs[0][1]
                    match = re.search(patter, imageUrl)
                    if match:
                        print 'found an image:', imageUrl
                        downloadImage(imageUrl)
                        imageUrlList.append(imageUrl)
            # pagination handling, left disabled by the original author:
            #if tag == 'a':
                #if len(attrs) == 4:
                    ##if attrs[1] == ('class', 'next'):
                        #nextUrl = host + attrs[2][1]
                        #print 'found an album page link:', nextUrl
                        #global startHtmlUrl
                        #if startHtmlUrl != nextUrl:
                            #getImageUrlByHtmlUrl(nextUrl)

# parse the front page and follow each album link found on it
def parse_url_picture(indexUrl):
    #indexUrl = 'http://desk.zol.com.cn/meinv/'
    #indexUrl = 'http://www.1ppt.com'
    m = urllib2.urlopen(indexUrl).read()
    #print m
    parserIndex = MyHtmlParse(True)
    parserIndex.feed(m)

picture_website = r'http://www.1ppt.com/'

class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(picture_website, callback=self.index_page)
        return

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            print each.attr.href
            parse_url_picture(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page)
        return

    @config(priority=2)
    def detail_page(self, response):
        return {
        }
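Both scripts in this post are Python 2 (urllib2 and HTMLParser were renamed in Python 3). As a rough orientation only, here is a minimal Python 3 sketch of the same download step; the save directory and file-name pattern mirror the values above, and the file is opened in binary mode so image bytes are not mangled:

# Minimal Python 3 sketch of downloadImage() above (urllib2 / HTMLParser
# became urllib.request / html.parser in Python 3).
# localSavePath and the file-name pattern are the same assumptions as above.
import os
import re
from urllib.request import urlopen

localSavePath = '/data/girls/'
patter = re.compile(r'[0-9]*\.jpg')

def downloadImage(url):
    match = patter.search(url)
    if not match:
        print('no match:', url)
        return
    print('downloading file:', match.group())
    data = urlopen(url).read()
    os.makedirs(localSavePath, exist_ok=True)       # create the save dir if missing
    with open(os.path.join(localSavePath, match.group()), 'wb') as f:  # binary mode
        f.write(data)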
The following script runs standalone (no need to deploy it to the pyspider platform):
#!/usr/bin/python
#coding: utf-8
#########################################################################
# File Name: girls.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Mon 09 Jun 2014 09:23:18 PM CST
#########################################################################
import urllib2, HTMLParser, re

# root url
host = "http://1ppt.com"
# local save directory
localSavePath = '/data/girls/'
# starting image html address
startHtmlUrl = ''
# html addresses of the image pages
htmlUrlList = []
# image urls
imageUrlList = []
patter = '[0-9]*\.jpg'

# download the image at the given url and save it locally
def downloadImage(url):
    print url
    cont = urllib2.urlopen(url).read()
    match = re.search(patter, url)
    if match:
        print 'downloading file:', match.group()
        filename = localSavePath + match.group()
        f = open(filename, 'w+')
        f.write(cont)
        f.close()
    else:
        print 'no match'

# fetch one album page found on the front page and parse it for images
def getImageUrlByHtmlUrl(htmlUrl):
    parser = MyHtmlParse(False)
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        content = response.read()
        parser.feed(content)
    except urllib2.URLError, e:
        print e.reason

class MyHtmlParse(HTMLParser.HTMLParser):
    def __init__(self, isIndex):
        self.isIndex = isIndex
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if self.isIndex:
            # on the front page, follow <a> tags that carry a title attribute
            if tag == 'a':
                if len(attrs) == 3:
                    if attrs[1][0] == 'title':
                        newUrl = host + attrs[0][1]
                        # print 'found an album page link:', newUrl
                        global startHtmlUrl
                        startHtmlUrl = newUrl
                        getImageUrlByHtmlUrl(newUrl)
        else:
            # on an album page, download every <img> whose src matches the pattern
            if tag == 'img':
                print attrs[0][0]
                print attrs[1][0]
                if attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1]:
                    imageUrl = attrs[0][1]
                    match = re.search(patter, imageUrl)
                    if match:
                        print 'found an image:', imageUrl
                        downloadImage(imageUrl)
                        imageUrlList.append(imageUrl)
            # pagination handling, left disabled by the original author:
            #if tag == 'a':
                #if len(attrs) == 4:
                    ##if attrs[1] == ('class', 'next'):
                        #nextUrl = host + attrs[2][1]
                        #print 'found an album page link:', nextUrl
                        #global startHtmlUrl
                        #if startHtmlUrl != nextUrl:
                            #getImageUrlByHtmlUrl(nextUrl)

# parse the front page and follow each album link found on it
indexUrl = 'http://www.1ppt.com'
m = urllib2.urlopen(indexUrl).read()
#print m
parserIndex = MyHtmlParse(True)
parserIndex.feed(m)
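For reference, here is a minimal Python 3 sketch of the front-page parsing side (what MyHtmlParse does when isIndex is True): it only collects candidate album links instead of recursing into them, which makes it easier to test. The "a tag with a title attribute" heuristic and the page encoding are assumptions carried over from the code above and may need adjusting if 1ppt.com changes its markup.

# Minimal Python 3 sketch of the front-page parsing done by MyHtmlParse(True).
# The <a title=...> heuristic mirrors the assumption in the scripts above;
# the page encoding below is an assumption and may need adjusting.
from html.parser import HTMLParser
from urllib.request import urlopen

host = 'http://www.1ppt.com'

class IndexParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.albumUrls = []          # collected album-page links

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # follow site-relative links that carry a title attribute
        if tag == 'a' and 'title' in attrs and attrs.get('href', '').startswith('/'):
            self.albumUrls.append(host + attrs['href'])

if __name__ == '__main__':
    raw = urlopen(host).read()
    parser = IndexParser()
    parser.feed(raw.decode('gbk', errors='ignore'))  # assumed page encoding
    for url in parser.albumUrls:
        print(url)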
Original article: http://www.cnblogs.com/panliu/p/4849212.html