标签:xls lam 文件 make spi www. other 地址 zip
# -*- coding: utf-8 -*-
"""Scrapy spider that walks wsyu.edu.cn and yields one item per fetched page."""
import datetime
from urllib import parse

import scrapy
from scrapy.http import Request

from WSYU.items import WsyuItem


class WsyuSpider(scrapy.Spider):
    """Crawl wsyu.edu.cn and emit a ``WsyuItem`` for every page it fetches.

    ``parse`` extracts the links of a page and schedules each new one for
    ``download_parse``.  ``download_parse`` emits the item for the page and
    then re-queues the same URL for ``parse`` so that its links are followed.
    """

    name = 'wsyu'
    allowed_domains = ['wsyu.edu.cn']
    start_urls = ['http://www.wsyu.edu.cn/']

    # Class-level crawl state (plain lists, despite the "_set" suffix).
    # html_url_set: URLs already scheduled for download_parse, in order.
    # other_url_set: URLs recognised as files/mail links and never requested.
    html_url_set = []
    other_url_set = []

    # Substrings that mark a link as "not an HTML page".  The match is a
    # plain substring test anywhere in the URL, so "@" catches mail links.
    wenjian_end = [
        "@", ".pdf", ".jpg", ".gif", ".png", ".doc",
        ".xls", ".ppt", ".mp3", ".rar", ".zip",
    ]

    def do_fiter(self, all_urls):
        """Record file/mail links from *all_urls* in ``other_url_set``.

        Args:
            all_urls: iterable of absolute URL strings.

        Returns:
            ``all_urls`` itself, unchanged; the caller excludes the recorded
            links by checking ``other_url_set``.
        """
        for one_url in all_urls:
            is_file_link = any(marker in one_url for marker in self.wenjian_end)
            # Skip links that are already recorded so the list does not grow
            # with one duplicate per page that repeats the same file link.
            if is_file_link and one_url not in self.other_url_set:
                self.other_url_set.append(one_url)
        return all_urls

    def parse(self, response):
        """Extract every ``<a href>`` of *response* and schedule the new ones.

        Yields:
            A ``Request`` handled by ``download_parse`` for each link that has
            not been scheduled before and is not a recorded file/mail link.
        """
        # Collect all link addresses and resolve them against the page URL.
        all_urls = [
            parse.urljoin(response.url, href)
            for href in response.xpath('//a/@href').extract()
        ]
        all_urls1 = self.do_fiter(all_urls)

        if all_urls1 is not None:
            for one_url in all_urls1:
                if one_url in self.html_url_set or one_url in self.other_url_set:
                    continue
                self.html_url_set.append(one_url)
                # one_url is already absolute, so it is requested as-is.
                yield Request(one_url, callback=self.download_parse)
        elif len(self.html_url_set) >= 2:
            # Fallback kept from the original: revisit the second most
            # recently scheduled page.  The length guard prevents an
            # IndexError when fewer than two pages have been scheduled.
            yield Request(url=self.html_url_set[-2], callback=self.parse)

    def download_parse(self, response):
        """Emit the item for *response*, then re-queue its URL for ``parse``.

        Yields:
            A ``WsyuItem`` with ``url``, ``content`` and ``create_time``
            filled in, followed by a ``Request`` for the same URL.
        """
        item = WsyuItem()
        item['url'] = response.url
        item['content'] = response.text
        item['create_time'] = datetime.datetime.now()
        yield item

        # Explicit equivalent of the deprecated Spider.make_requests_from_url.
        # dont_filter=True is required because this URL was just fetched and
        # the duplicate filter would otherwise drop the request.
        yield Request(response.url, callback=self.parse, dont_filter=True)
标签:xls lam 文件 make spi www. other 地址 zip
原文地址:https://www.cnblogs.com/longbigbeard/p/9716151.html