
Write your own crawler: copider

Posted: 2015-10-27 22:08:12

copider borrows some of Scrapy's conventions, but it is single-process and synchronous rather than asynchronous.
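To make the borrowed pattern concrete before the code, here is a minimal sketch (hypothetical callback name, placeholder URL; it assumes the Spider/Request API defined in copider.py below). Where Scrapy schedules requests on an asynchronous engine, copider's Request downloads the page on the spot and calls the callback inline:

#coding=utf-8
from copider import Request

def parse(response):
    # Runs synchronously, immediately after the page is downloaded
    print response.url, len(response.html)

Request("http://www.example.com/", parse)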

1. copider/copider.py

#coding=utf-8

'''
Created on 2015-10-08

@author: snt1
'''

import urllib2
import lxml.html
import StringIO


class Spider(object):
    def __init__(self, url, meta=None):
        self.URL = url
        self.META = meta
        self.TEXTMARK = self.get(url)
        self.SEL = self.selector(doc=self.TEXTMARK)

    def get(self, url):
        # Fetch the page body; on failure, log and fall back to an
        # empty string instead of hitting a NameError as before.
        shtml = ""
        try:
            req = urllib2.Request(url)
            req.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36")
            shtml = urllib2.urlopen(req, timeout=15).read()
        except Exception, e:
            print e, "...next.."

        data = StringIO.StringIO(shtml)
        HTML = data.read()
        return HTML

    # Return the raw HTML
    @property
    def html(self):
        return self.TEXTMARK

    @property
    def url(self):
        return self.URL

    @property
    def meta(self):
        return self.META

    def selector(self, doc=None):
        if doc:
            HTML = doc
        else:
            HTML = self.TEXTMARK  # was self.HTML, which does not exist
        return lxml.html.fromstring(HTML)

    def xpath(self, rule):
        # Element results are reduced to their attribute dicts; string
        # results (text() or @attr queries) raise AttributeError on
        # .attrib and are returned unchanged.
        iter_list = self.SEL.xpath(rule)
        attrList = []
        try:
            for ele in iter_list:
                attrList.append(ele.attrib)
            return attrList
        except AttributeError:
            return iter_list


def Request(url, func, **meta):
    # Synchronous stand-in for scrapy.Request: fetch the page now and
    # hand the response straight to the callback.
    if meta:
        response = Spider(url, meta["meta"])
    else:
        response = Spider(url)
    func(response)
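A quick usage sketch (placeholder URL, not from the original post) showing the two behaviors of Spider.xpath: string results such as @href or text() pass through unchanged, while element results come back as attribute dicts:

#coding=utf-8
from copider import Spider

response = Spider("http://www.example.com/")
print response.html[:100]           # raw page source
print response.xpath('//a/@href')   # strings: returned as-is
print response.xpath('//a')         # elements: list of .attrib dicts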

2. copider/aero.py

#coding=utf-8

'''
Created on 2015-10-08

@author: snt1
'''


import re
from copider import Spider, Request


class AeroCopider(object):

    name = "aero"
    storeId = "554b14c97b010cc731e81b35"  # site ID
    allowed_domains = ["www.xxxx.com"]

    root_url = "http://www.xxxx.com"
    category_url = root_url + "/category/index.jsp?numResultsPerPage=100&categoryId=%s"
    cap_category_url = root_url + "/family/index.jsp?categoryId=%s&page=%d&numResultsPerPage=100"
    url_dicts = {3534623: "Girls", 3534624: "Guys"}

    def __init__(self):
        self.start_urls()

    def start_urls(self):
        for fid in self.url_dicts.keys():
            url = self.category_url % fid
            response = Spider(url)
            node_a = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/@href')
            node_text = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/text()')

            url_list, cid_list = [], []
            for num, preparing in enumerate(node_a):
                pattern = re.compile(r'family.jsp\?categoryId=')
                if pattern.search(preparing):
                    chd_url = self.root_url + preparing
                    # Strip any trailing "&cp=..." parameter; the flags
                    # belong in compile(), not in sub(), where they
                    # would be taken as the count argument
                    pattern_sub = re.compile(r'&cp=.*?$', re.S | re.I | re.M)
                    chd_url = pattern_sub.sub('', chd_url)

                    pattern_fin = re.compile(r'family.jsp\?categoryId=(\d+)')
                    cid = pattern_fin.findall(chd_url)[0]
                    url_list.append(chd_url)
                    cid_list.append(cid)
                    print(u"Category link: %s -> %s" % (node_text[num], chd_url))
                    cateid = cid_list[num]
                    Request(chd_url, self.parse_page, meta={"cateid": cateid})
                    print

    def parse_page(self, response):
        #total_page = response.xpath('//div[@class="pagination"]/ul/li/a[@rel="nofollow"]/text()')
        total_items = int(response.xpath('//*[@id="main-wrap"]//li[@class="count"]/span/text()')[0])
        # 100 items per page: any remainder adds one page, and there is
        # always at least one page (the original "mod > 1" test
        # undercounted when 100 < total_items < 200).
        total_page, rem = divmod(total_items, 100)
        if rem > 0:
            total_page += 1
        total_page = max(total_page, 1)
        print(u"Total pages: %s -> %s" % (total_page, response.url))

        cateid = response.meta["cateid"]
        for page in range(1, total_page + 1):
            url = self.cap_category_url % (cateid, page)
            Request(url, self.parse_product)

    def parse_product(self, response):
        product = response.xpath('//*[@id="products"]//h4/a/@href')
        print(u"Source page: %s" % response.url)
        print(u"Products: %s -> paths: %s" % (len(product), product))


if __name__ == "__main__":
    AeroCopider()
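To try it out, run aero.py directly under Python 2 (urllib2 and StringIO were removed in Python 3): python aero.py from inside the copider/ directory. Note that www.xxxx.com is masked in the original post, so the XPath rules will only produce results against a real site with matching markup.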

Original post: http://www.cnblogs.com/caoguo/p/4915570.html
