码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫之获取当当网全部图书

时间:2017-04-02 17:33:52      阅读:221      评论:0      收藏:0      [点我收藏+]

标签:取数   ide   实例   continue   rect   取出   soup   lin   highlight   

#encoding:utf-8
#
#author:wuhao
#

#******

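#Scrapes book listings from dangdang.com without using a crawler framework.

#main is the entry-point script.

#KindLinks.py and 获取数据信息.py each wrap one helper class.

#KindLinks has a single method, getDifferentSeriesBookUrl(); it returns listUrl --- (name (sub-category name), url (link of that sub-category)) and LB --- (the top-level categories).

#获取数据信息 has two methods, getPage() and getInfo(): getPage() returns the number of pages, and getInfo() returns the details of each book (title, review count, author, publisher, price, publication date). The title is not parsed any further, so it may be messy.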




#当当网商品种类链接,获取不同种类的所有图书

from bs4 import BeautifulSoup

class _FirstPageLinkToGetUrl():
    def __init__(self,opener):
        self.opener=opener
        self.url="http://category.dangdang.com/?ref=www-0-C"


    def getDifferentSeriesBookUrl(self):
        html=self.opener.open(self.url).read().decode("gbk")

        soup=BeautifulSoup(html,"html.parser")
        #类别
        LB = []
        # 字典存储小类别对应的URL
        #dictUrl = {}
        #
        temp=0
        listUrl=[]
        count=[]
        #outside  ---外层的div
        #_li      ---li层
        for outsideDiv in soup.find("div", class_="classify_books", id="floor_1").find_all("div", class_="classify_kind"):
            LB.append(outsideDiv.div.a.string)
            temp=0
            dictUrl={}
            for _li in outsideDiv.find("ul").find_all("li"):
                if _li.a.string == "更多":
                    continue
                else:
                   # print(s.a.get("href"), s.a.string)
                    temp+=1
                    dictUrl[_li.a.string] = _li.a.get("href")
            count.append(temp)
            listUrl.append(dictUrl)
        return listUrl,LB
#获取网页中包含的图书的信息
from bs4 import BeautifulSoup
import re


class _GetBookInfo():
    """Read the page count and per-book details from a Dangdang listing page.

    The opener only needs to support ``opener.open(url).read()`` returning
    GBK-encoded bytes.
    """

    def __init__(self,opener):
        self.opener=opener

    def getPage(self,url):
        """Return the total number of result pages advertised at ``url``.

        The count is taken from the first ``<span>/N</span>`` marker in the
        page. As a side effect the raw HTML is dumped to ``test.txt`` in the
        current directory.

        Raises:
            IndexError: if the page contains no ``<span>/N</span>`` marker.
        """
        html = self.opener.open(url).read().decode("gbk")
        # Debug dump of the fetched page; the encoding is explicit so the
        # write does not depend on the platform's default locale.
        with open("test.txt", "w", encoding="utf-8") as f:
            f.write(html)
        regex = re.compile(r"<span>/\d+</span>")
        valueNum = re.findall(r"\d+", regex.findall(html)[0])
        return int(valueNum[0])

    def getInfo(self,url):
        """Return one record per book listed at ``url``.

        Each record is a list of
        ``[name, star link text, author, publisher, price, publishing time]``.
        List items missing any of the expected tags are skipped. An empty
        list is returned when the page has no book list at all.
        """
        html = self.opener.open(url).read().decode("gbk")
        soup = BeautifulSoup(html, "html.parser")
        ulTag = soup.find("ul", class_="list_aa listimg", id=True)
        if ulTag is None:
            return []

        data1 = []
        for li in ulTag.find_all("li", id=True):
            try:
                # NOTE(review): the date pattern is copied from a
                # whitespace-collapsed paste; confirm its exact spacing.
                data = [
                    li.find("p", class_="name").string,
                    li.find("p", class_="star").a.string,
                    li.find("p", class_="author").a.string,
                    li.find("p", class_="publishing").a.string,
                    li.find("p", class_="price").span.string,
                    re.findall(r"/ .+ ", str(li.find("p", class_="publishing_time")))[0].replace(" ", "").replace("/", ""),
                ]
            except (AttributeError, IndexError):
                # A tag was missing (find() returned None) or the date
                # pattern did not match: skip this incomplete entry.
                continue
            data1.append(data)
        return data1
#-encoding:utf-8
from 当当网图书爬取 import 获取数据信息 as bookInfo
from 当当网图书爬取 import KindLinks as kls
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import http.cookiejar
import re
import xlwt
import xlrd

def getCorrectUrl(url,page):
    """Return the URL of result page ``page`` for a category ``url``.

    Page 0 means "the category URL as-is". For any other page, the token
    ``pg<page>-`` is inserted right after the first ``m/`` in the URL (the
    end of the ``....com/`` host part).
    """
    if page==0:
        return url
    # Limit to one replacement so a later "m/" in the path is left alone.
    return url.replace("m/", "m/pg"+str(page)+"-", 1)


# Dangdang page that lists every product category.
url="http://category.dangdang.com/?ref=www-0-C"
# Cookie jar shared by every request.
Cookie=http.cookiejar.CookieJar()
# Handler that stores and sends cookies.
CookieHandle=urllib.request.HTTPCookieProcessor(Cookie)
# Opener used for every request of the crawl.
opener=urllib.request.build_opener(CookieHandle)
# Pretend to be a desktop browser.
header=    {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    }
head=list(header.items())
opener.addheaders=head
# Open the page once so the opener picks up the site's cookies.
opener.open(url)

# Fetch the category links through KindLinks.
_kls=kls._FirstPageLinkToGetUrl(opener)
# (list of {sub-category name: url} dicts, list of top-level category names)
bdata=_kls.getDifferentSeriesBookUrl()

bdata_url=bdata[0]          # every url that has to be crawled
bdata_gd=bdata[1]           # top-level category names
# Split each dict into parallel name / url lists.
bdata_url_name=[]
bdata_url_url=[]
print((list(bdata_url[0].values())))
for key in range(len(bdata_url)):
    bdata_url_url.append(list(bdata_url[key].values()))
    bdata_url_name.append(list(bdata_url[key].keys()))
print(bdata_url_name)
print(bdata_url_url[0])
# Scraper for the listing pages.
bio=bookInfo._GetBookInfo(opener)
# Column headers written to the first row of every sheet.
StyleinfoInExcel=["书名","评论数","作者","出版社","价格","出版日期"]
book=xlwt.Workbook(encoding="utf-8")
# Running total of books written across all workbooks.
count=0

for _gd, (names, links) in enumerate(zip(bdata_url_name, bdata_url_url)):
    # Sheets added to the current workbook; it is saved only if non-empty.
    sheets_added=0
    for name, link in zip(names, links):
        page = bio.getPage(link)           # number of result pages
        sheetname=name.replace("/", "-")
        try:
            sheet=book.add_sheet(sheetname=sheetname)
        except Exception:
            # Sheet name rejected by xlwt: skip this sub-category.
            continue
        sheets_added+=1
        print(sheetname+"正在写入...")
        for col, title in enumerate(StyleinfoInExcel):
            sheet.write(0,col,title)
        # Row index of the last written book (row 0 holds the headers).
        temp=0
        # Pages are numbered 1..page inclusive.
        for CurrentPage in range(1,page+1):
            try:
                data=bio.getInfo(getCorrectUrl(link,CurrentPage))
                for record in data:
                    temp+=1
                    for col, value in enumerate(record):
                        sheet.write(temp,col,value)
                    count+=1
            except Exception:
                # Best effort: a failed page is skipped, while Ctrl+C
                # (KeyboardInterrupt) still stops the crawl.
                continue
        print("已写入"+str(count)+"本书")
        print(sheetname+"写入完成...\r\n")

    # One workbook per top-level category. Saving after the inner loop means
    # a skipped last sub-category cannot prevent the save.
    if sheets_added:
        book.save(bdata_gd[_gd].replace("/","-")+".xls")
        book = xlwt.Workbook(encoding="utf-8")
        print("--------已完成"+bdata_gd[_gd])
print("写入完成,共计"+str(count)+"本书")
技术分享

  

 

爬虫之获取当当网全部图书

标签:取数   ide   实例   continue   rect   取出   soup   lin   highlight   

原文地址:http://www.cnblogs.com/one-lightyear/p/6659350.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!