码迷,mamicode.com
首页 > 编程语言 > 详细

用python抓了一些数据存到本地

时间:2017-05-07 12:55:47      阅读:163      评论:0      收藏:0      [点我收藏+]

标签:blog   url   count   mini   pre   web   open   get   beautiful   

import codecs

from xml.dom.minidom import Document
import requests
from bs4 import BeautifulSoup

# Module-level DOM document from which this script's XML nodes are created.
doc = Document()
def getAllUrl(pageCount):
    """Return the listing-page URL for page number ``pageCount``.

    Args:
        pageCount: Page number substituted into the URL template.

    Returns:
        The URL string with ``{page}`` replaced by ``pageCount``.
    """
    # The template must be a string literal; the bare URL was a syntax error.
    url = "https://www.xxx.co/xxxx/{page}"
    return url.format(page=pageCount)

def getHtml(pageCount, timeout=30):
    """Fetch the listing page numbered ``pageCount`` over HTTP.

    Args:
        pageCount: Page number passed to ``getAllUrl`` to build the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests`` response object for the page.
    """
    return requests.get(getAllUrl(pageCount), timeout=timeout)

def WirteXml(gName, gImg, wUrl, parent=None):
    """Append ``name``, ``imgUrl`` and ``webUrl`` child elements for one record.

    Each child element holds a single text node with the matching argument.

    Args:
        gName: Text stored in the ``name`` element.
        gImg: Text stored in the ``imgUrl`` element.
        wUrl: Text stored in the ``webUrl`` element.
        parent: Element that receives the three children. When omitted, the
            module-level ``aperson`` element is used.
    """
    if parent is None:
        # Default target: the element the script created at module level.
        parent = aperson
    # Build nodes from the document that owns the parent element.
    owner = parent.ownerDocument
    for tag, text in (("name", gName), ("imgUrl", gImg), ("webUrl", wUrl)):
        element = owner.createElement(tag)
        parent.appendChild(element)
        # Previously the imgUrl text was attached via ``img.append.Child``,
        # which raised AttributeError and skipped the webUrl element.
        element.appendChild(owner.createTextNode(text))

# Script entry point: scrape every listing page, collect the title/src of each
# <img> tag into the shared XML document, and write it to people.xml.
if __name__ == "__main__":
    filename = "people.xml"
    people = doc.createElement("Actresses")
    doc.appendChild(people)
    # NOTE(review): every scraped record is appended to this single <person>
    # element; confirm whether one <person> per image was intended.
    aperson = doc.createElement("person")
    people.appendChild(aperson)
    for count in range(1, 1250):
        html = getHtml(count).text
        soup = BeautifulSoup(html, "lxml")
        for img in soup.findAll("img"):
            try:
                girlName = img.attrs["title"]
                girlImage = img.attrs["src"]
            except KeyError:
                # Skip <img> tags that lack a title or src attribute.
                continue
            # Detail-page URL: image file name with its last six characters
            # removed, appended to the site prefix.
            webUrl = "https://www.xxx.co/xx/" + girlImage.split("/")[-1][:-6]
            WirteXml(girlName, girlImage, webUrl)
        print(str(count) + "页抓完!!!")
    # Open the output only once the document is complete, and let the context
    # manager close it even if serialisation fails.
    with codecs.open(filename, "w", "utf-8") as f:
        f.write(doc.toprettyxml(indent="  "))

 

用python抓了一些数据存到本地

标签:blog   url   count   mini   pre   web   open   get   beautiful   

原文地址:http://www.cnblogs.com/Conker/p/6820345.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!