
A Scrapy spider that matches every product URL with its images and writes them to Excel

Posted: 2017-11-26 16:48:41

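The project has four pieces: the item definition (items.py), the spider that parses the listing pages and pairs each product link with its image URLs (dd.py), a one-off script that creates the Excel workbook, and a pipeline that appends every scraped page to that workbook (pipelines.py).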

 

items.py — item definitions

import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()      # product title
    link = scrapy.Field()       # product page URL
    comment = scrapy.Field()    # review-count text
    pic_addr = scrapy.Field()   # list of image-URL groups, one group per product


dd.py — the spider: parsing and matching the data
# -*- coding: utf-8 -*-
import scrapy
import re
from dangdang.items import DangdangItem
from scrapy.http import Request


class DdSpider(scrapy.Spider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cid4003844.html']

    def parse(self, response):
        item = DangdangItem()
        item['title'] = response.xpath('//a[@name="itemlist-title"]/@title').extract()
        item['link'] = response.xpath('//a[@name="itemlist-title"]/@href').extract()
        item['comment'] = response.xpath('//a[@name="itemlist-review"]/text()').extract()
        item['pic_addr'] = response.xpath('//a/img/@data-original').extract()
        lst = item['pic_addr']
        lnk = item['link']
        # Group together image URLs that share the same 5-digit product id
        i = 0
        lst1 = []
        while i < len(lst):
            pat = 'http://.*?/.*?/.*?/([0-9]{5})'
            data = re.compile(pat).findall(lst[i])
            k = 0
            j = 0
            ll = []
            while j < len(lst):
                data1 = re.compile(pat).findall(lst[j])
                if data == data1:
                    ll.append(lst[j])
                    k += 1
                    if k > 1:          # keep the first occurrence, pop duplicates
                        lst.pop(j)
                        j = j - 1
                j += 1
            lst1.append(ll)
            i += 1
        # Match each product link with its image group by product id,
        # so the two lists line up row by row when written out
        lst = []
        for m in range(0, len(lnk)):
            pat1 = 'http://.*?/([0-9]{5})'
            d = re.compile(pat1).findall(lnk[m])
            for n in range(0, len(lst1)):
                pat2 = 'http://.*?/.*?/.*?/([0-9]{5})'
                d1 = re.compile(pat2).findall(lst1[n][0])
                if d == d1:
                    lst.append(lst1[n])
                    break
        item['pic_addr'] = lst

        yield item
        for page in range(2, 81):
            url = 'http://category.dangdang.com/pg' + str(page) + '-cid4003844.html'
            yield Request(url, callback=self.parse)
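The nested while loops above group image URLs by the 5-digit product id embedded in each URL, then reorder the groups to follow the order of the product links. The same matching step can be sketched more compactly with a dict keyed by product id; this is a minimal illustration under the same URL assumptions (the helper name is hypothetical, not part of the original spider):

import re

def group_pics_by_link(links, pics):
    """Group image URLs by 5-digit product id, ordered to match the links.
    Hypothetical helper; same regex conventions as the spider above."""
    link_pat = re.compile(r'http://.*?/([0-9]{5})')
    pic_pat = re.compile(r'http://.*?/.*?/.*?/([0-9]{5})')
    groups = {}
    for p in pics:
        ids = pic_pat.findall(p)
        if ids:
            groups.setdefault(ids[0], []).append(p)
    ordered = []
    for l in links:
        ids = link_pat.findall(l)
        if ids and ids[0] in groups:
            ordered.append(groups[ids[0]])
    return ordered

Building the dict takes a single pass over the image list instead of the quadratic scan-and-pop of the nested loops, and it never mutates the input lists.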

Create the Excel workbook
import xlwt

book = xlwt.Workbook(encoding="utf-8")
sht = book.add_sheet("publisher", cell_overwrite_ok=True)
sht.write(0, 0, u'序号')       # index
sht.write(0, 1, u'标题')       # title
sht.write(0, 2, u'链接')       # link
sht.write(0, 3, u'评论')       # comments
sht.write(0, 4, u'图片链接1')  # image link 1
sht.write(0, 5, u'图片链接2')  # image link 2
sht.write(0, 6, u'图片链接3')  # image link 3
sht.write(0, 7, u'图片链接4')  # image link 4
sht.write(0, 8, u'图片链接5')  # image link 5
book.save("d:\\data\\dangdang\\dangdang.xls")
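Run this script once before starting the crawl: the pipeline below opens the existing workbook with xlrd and appends below it, so d:\data\dangdang\dangdang.xls and its header row must already exist. Keep in mind that the legacy .xls format is limited to 65,536 rows per sheet.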


pipelines.py — writing the data

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import xlrd
from xlutils.copy import copy


class DangdangPipeline(object):
    def process_item(self, item, spider):
        # Re-open the workbook, copy it, and append below the existing rows
        rb = xlrd.open_workbook('d:\\data\\dangdang\\dangdang.xls')
        sht = rb.sheets()[0]
        nrows = sht.nrows
        wb = copy(rb)
        ws = wb.get_sheet(0)
        for i in range(0, len(item['title'])):
            title = item['title'][i]
            link = item['link'][i]
            comment = item['comment'][i]
            pic_addr = item['pic_addr'][i]
            try:
                ws.write(nrows, 0, nrows)   # sequence number
                ws.write(nrows, 1, title)
                ws.write(nrows, 2, link)
                ws.write(nrows, 3, comment)
                for j in range(0, len(pic_addr)):
                    ws.write(nrows, 4 + j, pic_addr[j])
                nrows += 1
            except Exception as err:
                print(err)
        wb.save('d:\\data\\dangdang\\dangdang.xls')
        return item
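As the header comment says, the pipeline must be registered in the project settings or process_item() will never be called. A minimal sketch of the settings.py entry, assuming the project is named dangdang (as the import in dd.py suggests); 300 is just the conventional priority value:

# settings.py — register the pipeline so Scrapy calls process_item()
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}

With that in place, the crawl is started with scrapy crawl dd (the spider's name attribute).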





Original article: http://www.cnblogs.com/xiesongyou/p/7899104.html
