suning.py

```python
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy


class SuningSpider(scrapy.Spider):
    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        # Earlier attempts that extracted the first-level and second-level menu links
        # separately (without pairing each sub-list with its parent category) are
        # superseded by the zip() approach below.
        menu_list = response.xpath("//div[@class='menu-list']/div[@class='menu-sub']")
        for menu in menu_list:
            item = {}
            div_list = menu.xpath("./div")
            for div_lr in div_list:
                p_list = div_lr.xpath("./p")
                ul_list = div_lr.xpath("./ul")
                # The markup looks like:
                # <div><p>小说</p><ul><li></li></ul><p>青春文学</p><ul><li></li></ul><p>艺术</p><ul><li></li></ul></div>
                # Each <p> (top-level category) is a sibling of its <ul> (sub-categories),
                # so iterate over both in parallel with zip() to attach every <li><a>
                # to the category it belongs to.
                for p, ul in zip(p_list, ul_list):
                    item["title_1"] = p.xpath("./a/text()").extract()
                    item["href_1"] = p.xpath("./a/@href").extract()
                    li_list = ul.xpath("./li")
                    for li in li_list:
                        # Category pages look like
                        #   https://list.suning.com/1-502688-0.html
                        # and their sorted/paged version looks like
                        #   https://list.suning.com/1-502688-0-0-0-0-0-14-0-4.html
                        item["title_2"] = li.xpath("./a/text()").extract_first()
                        item["href_2"] = li.xpath("./a/@href").extract_first()
                        item["href_2"] = item["href_2"].rsplit('.', 1)[0] + "-0-0-0-0-14-0-4.html"
                        yield scrapy.Request(
                            item["href_2"],                 # list page
                            callback=self.parse_list,
                            meta={"item": deepcopy(item)}   # deepcopy: item is reused across iterations
                        )

                        # Alternative: request the AJAX endpoint behind the list page, e.g.
                        # https://list.suning.com/emall/showProductList.do?ci=502679&pg=03&cp=0&il=0&iy=-1&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010&paging=1&sub=0
                        # next_part_url = ('https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}'
                        #                  '&il=0&iy=-1&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB'
                        #                  '&id=IDENTIFYING&cc=010&paging=1&sub=0')
                        # ci = item["href_2"].split("-")[1]
                        # cp = item["href_2"].split("-")[2].split(".")[0]
                        # next_part_url = next_part_url.format(ci, cp)
                        # yield scrapy.Request(
                        #     next_part_url,
                        #     callback=self.parse_list,
                        #     meta={"item": deepcopy(item)}
                        # )

    def parse_list(self, response):
        print(response.request.url)
        item = deepcopy(response.meta["item"])
        # Product entries; //div[@id='filter-results']/ul/li works as well.
        li_list = response.xpath("//li[@name='']")
        for li in li_list:
            item["book_name"] = li.xpath(".//p[@class='sell-point']/a/text()").extract_first()
            # Other fields available on the same node:
            # item["book_href"] = li.xpath(".//div[@class='res-info']/p[2]/a/@href").extract_first()
            # item["book_price"] = li.xpath(".//div[@class='res-info']/p[1]/em/text()").extract_first()
            # item["shop_name"] = li.xpath(".//div[@class='res-info']/p[4]/@salesname").extract_first()
            # item["shop_href"] = li.xpath(".//div[@class='res-info']/p[4]/a/@href").extract_first()
            yield item

        # Pagination: the element just before the "next page" link holds the total page count.
        page_count = response.xpath("//a[@id='nextPage']/preceding-sibling::*[1]/text()").extract_first()
        if page_count:
            # The canonical link carries the current page number, e.g.
            # https://list.suning.com/1-502687-1-0-0-0-0-14-0-4.html -> page 1.
            # Alternative: int(response.xpath("//a[@class='cur']/text()").extract_first())
            current_page = response.xpath("//link[@rel='canonical']/@href").extract_first()
            current_page_num = int(current_page.split('-')[2])
            url_num = item["href_2"].split('-')[1]  # category id, e.g. 502688
            if current_page_num < int(page_count):
                next_url = 'https://list.suning.com/1-{}-{}-0-0-0-0-14-0-4.html'.format(
                    url_num, current_page_num + 1)
                yield scrapy.Request(
                    next_url,
                    callback=self.parse_list,
                    meta={"item": response.meta["item"]}
                )
```
pipelines.py
```python
import json
import codecs

from scrapy.exceptions import DropItem

# Approach 1 requires the save path to be configured in settings.py:
#   SUNING_FILE_PATH = "suningdata.log"
#
# When several pipelines are enabled, they run in priority order: the pipeline with
# the lower number is opened first, processes items first, and is closed last.


class SuningbookPipeline(object):
    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """Called when the pipeline object is created; reads settings via the crawler."""
        print('File.from_crawler')
        # Look up SUNING_FILE_PATH in the project settings.
        path = crawler.settings.get('SUNING_FILE_PATH')
        return cls(path)

    def open_spider(self, spider):
        """Called when the spider starts running."""
        # if spider.name == 'chouti':  # with several spiders, only act on a specific one
        print('File.open_spider')
        self.f = open(self.path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.f.write(lines)
        # Returning the item hands it to the next pipeline's process_item();
        # without the return, the next pipeline receives nothing.
        return item
        # To stop the item from reaching later pipelines, raise DropItem()
        # instead of returning it:
        # raise DropItem()

    def close_spider(self, spider):
        """Called when the spider closes."""
        print('File.close_spider')
        self.f.close()


# A second pipeline can be enabled alongside this one, e.g. a DbSuningbookPipeline
# that saves to a database instead of a file; it follows the same structure as the
# class above (from_crawler / open_spider / process_item / close_spider).


# Approach 2: write to a JSON file opened with codecs to avoid encoding problems.
# The file is opened when the class is instantiated; process_item converts the item
# to a dict for json.dumps, sets ensure_ascii=False so non-ASCII text is not stored
# as "\xe5..." escapes, and returns the item for any later pipelines. The file is
# closed when the spider closes.
# class SuningbookPipeline(object):
#     def __init__(self):
#         self.file = codecs.open('suning.json', 'w', encoding="utf-8")
#
#     def process_item(self, item, spider):
#         lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
#         self.file.write(lines)
#         return item
#
#     def spider_closed(self, spider):
#         self.file.close()


# Approach 3: open the file in open_spider and close it in close_spider.
# class SuningbookPipeline(object):
#     def open_spider(self, spider):
#         self.f = open('xxx.text', 'a+', encoding='utf-8')
#
#     def process_item(self, item, spider):
#         line = json.dumps(dict(item), ensure_ascii=False) + "\n"
#         self.f.write(line)
#         return item
#
#     def close_spider(self, spider):
#         self.f.close()


# Approach 4: open the file inside process_item (simple, but reopens it for every item).
# class SuningbookPipeline(object):
#     def process_item(self, item, spider):
#         with open('data.txt', 'a') as f:
#             f.write(item['title_1'])
#             f.write(item['href_1'])
#             f.write(item['book_name'] + '\n')
#         return item
```
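The `return item` / `raise DropItem()` behaviour described in the comments above is easiest to see with a small filtering pipeline. The class below is only a sketch for illustration and is not part of the project; the class name and the `book_name` check are assumed examples:

```python
from scrapy.exceptions import DropItem


class RequireBookNamePipeline(object):
    """Hypothetical pipeline: drop items that have no book_name (illustration only)."""

    def process_item(self, item, spider):
        if not item.get("book_name"):
            # DropItem stops the item here; pipelines with a higher priority
            # number never see it.
            raise DropItem("missing book_name: %r" % item)
        # Returning the item passes it on to the next pipeline.
        return item
```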
settings.py
```python
# Save path used by SuningbookPipeline
SUNING_FILE_PATH = "suningdata.log"

# Override the default duplicate-request filter
# DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
# DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'

# Limit crawl depth
# DEPTH_LIMIT = 3
```
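For any of these pipelines to run they must also be registered in `ITEM_PIPELINES`, whose priority numbers produce the ordering described at the top of pipelines.py. Here is a hedged example; the module path `suningbook.pipelines` is an assumption based on the repository name and may differ in the actual project:

```python
# Assumed settings.py snippet; 'suningbook.pipelines' is a guess at the package name,
# and DbSuningbookPipeline only exists if you add such a class yourself.
ITEM_PIPELINES = {
    'suningbook.pipelines.SuningbookPipeline': 300,      # lower number: opened and run first
    # 'suningbook.pipelines.DbSuningbookPipeline': 400,  # runs after it, closed before it
}
```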
Project repo: https://github.com/CH-chen/suningbook
Original post: https://www.cnblogs.com/chvv/p/10332465.html