码迷,mamicode.com
首页 > Web开发 > 详细

抓取天涯文章的蜘蛛代码,刚经过更新(因为天涯页面HTML代码变化)

时间:2014-08-08 21:14:36      阅读:311      评论:0      收藏:0      [点我收藏+]

标签:style   blog   http   color   os   for   ar   div   

# -*- coding: utf-8 -*-
import urllib2
import traceback
import codecs
from BeautifulSoup import BeautifulSoup

def openSoup(url,code):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        code: Character encoding name passed to BeautifulSoup as
            ``fromEncoding`` (for example "utf-8" or "gb2312").

    Returns:
        The BeautifulSoup object built from the downloaded markup.
    """
    page = urllib2.urlopen(url)
    try:
        html = page.read()
    finally:
        # Release the connection even when reading the body fails
        page.close()
    return BeautifulSoup(html, fromEncoding=code)

def getContentFromDiv(contents, min_length=50):
    """Join the text children of a node into one formatted paragraph.

    Items that cannot be concatenated onto a string are skipped, so only
    string-like children contribute to the result.

    Args:
        contents: Iterable of child items; string-like items are kept.
        min_length: Paragraphs whose stripped text is shorter than this are
            dropped. Defaults to 50, the threshold originally hard-coded.

    Returns:
        "" when the stripped text is shorter than *min_length*; otherwise
        the text indented by four spaces and followed by two CRLF pairs.
    """
    text = ""
    for content in contents:
        try:
            text += content
        except (TypeError, UnicodeError):
            # Not concatenable as text (e.g. a non-string child): skip it
            continue

    text = text.strip()
    if len(text) < min_length:
        return ""
    return "    " + text + "\r\n" + "\r\n"

def readHtml(soup,fp,authname):
    """Extract one author's posts from a parsed page and write them to *fp*.

    The first ``div`` with class ``bbs-content clearfix`` is always taken
    (presumably the opening post -- confirm against the page layout).
    Every ``div`` with class ``atl-item`` is then kept only when the text of
    its ``a.js-vip-check`` link equals *authname*.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` / ``findAll``.
        fp: Writable text stream that receives the extracted content.
        authname: Author name whose replies are kept.
    """
    pieces = []

    first_post = soup.find(name="div", attrs={"class": "bbs-content clearfix"})
    if first_post is not None:
        pieces.append(getContentFromDiv(first_post.contents))

    for reply in soup.findAll(name="div", attrs={"class": "atl-item"}):
        author_link = reply.find(name="a", attrs={"class": "js-vip-check"})
        # Skip replies with no author link or with an empty one
        if author_link is None or not author_link.contents:
            continue
        if author_link.contents[0] != authname:
            continue

        body = reply.find(name="div", attrs={"class": "bbs-content"})
        if body is None:
            # A reply without a content div has nothing to extract
            continue
        pieces.append(getContentFromDiv(body.contents))

    fp.write("".join(pieces))
   
def getNextPage(soup,pno):
    """Return the absolute URL of the next page, or "OVER" when there is none.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find``.
        pno: Number of the page being requested; not used by the lookup and
            kept only so existing callers keep working.

    Returns:
        "http://bbs.tianya.cn" followed by the ``href`` of the
        ``a.js-keyboard-next`` link, or the sentinel string "OVER" when the
        page has no such link.
    """
    nextlink = soup.find(name="a",attrs={"class":"js-keyboard-next"})
    if nextlink is None:
        return "OVER"
    return "http://bbs.tianya.cn"+nextlink["href"]

def getHtml(url,filename,authname):
    """Crawl a thread page by page and save one author's posts to a file.

    Starting at *url*, each page is parsed, the matching posts are written
    to *filename* (UTF-8), and the crawl follows the next-page link until
    getNextPage reports "OVER".

    Args:
        url: Address of the first page to fetch.
        filename: Path of the output file; it is opened in "w" mode.
        authname: Author name whose posts are kept.
    """
    p = 1
    # The with-block closes the file even if a page fails to download or parse
    with codecs.open(filename, "w", "utf-8") as fp:
        while True:
            soup = openSoup(url, "utf-8")
            readHtml(soup, fp, authname)
            url = getNextPage(soup, p + 1)
            if url == "OVER":
                break
            # Single-argument print(...) gives the same output on Python 2 and 3
            print("PAGE " + str(p) + " OK")
            p = p + 1

    print("It's Over")


if __name__ == "__main__":
    # Crawl the thread from its first page and keep only this author's posts
    getHtml("http://bbs.tianya.cn/post-no05-143258-1.shtml", "krzc.txt", u"关河五十州")
    # To resume from a later page, pass that page's URL instead, e.g.
    # http://bbs.tianya.cn/post-no05-143258-1036.shtml

 

抓取天涯文章的蜘蛛代码,刚经过更新(因为天涯页面HTML代码变化),布布扣,bubuko.com

抓取天涯文章的蜘蛛代码,刚经过更新(因为天涯页面HTML代码变化)

标签:style   blog   http   color   os   for   ar   div   

原文地址:http://www.cnblogs.com/code-style/p/3900122.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!