标签:
#coding:utf-8
'''
Created on 2015-4-5
@author: Administrator
'''
from bs4 import BeautifulSoup
import urllib2
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back. Setting the default to UTF-8 lets the
# unicode text scraped below be written to a byte-mode file without an
# explicit .encode() call.
reload(sys)
sys.setdefaultencoding('utf-8')
'''
Crawler function
'''
def GetCnBlog(cnt):
    """Collect post titles and links from the cnblogs site-home listing.

    Fetches listing pages 1..cnt from
    ``http://www.cnblogs.com/sitehome/p/<page>`` and, for every ``<h3>``
    element that contains a link, records one line of the form
    ``"<h3 text> <href>\\r\\n"``.

    Args:
        cnt: Number of listing pages to fetch; must be between 1 and 20
            inclusive.

    Returns:
        A list of formatted lines, or ``None`` when ``cnt`` is outside
        the accepted 1..20 range.
    """
    if cnt < 1 or cnt > 20:
        return None

    base_url = 'http://www.cnblogs.com/sitehome/p/'
    pages = []
    # range(1, cnt + 1) so that exactly ``cnt`` pages are fetched, including
    # the last one.
    for page_no in range(1, cnt + 1):
        # Build each URL from the fixed base; appending to a running string
        # would yield .../p/1, .../p/12, .../p/123, ...
        content_stream = urllib2.urlopen(base_url + str(page_no))
        try:
            soup = BeautifulSoup(content_stream)
        finally:
            # Release the HTTP connection even if parsing fails.
            content_stream.close()

        for heading in soup.find_all('h3'):
            link = heading.find('a')
            # Skip headings that carry no usable link instead of crashing.
            if link is None or not link.get('href'):
                continue
            pages.append(heading.text + ' ' + link['href'] + '\r\n')
    return pages
def Save(p, path='C:\\pages.txt'):
    """Write the collected lines to a text file, replacing any old content.

    Args:
        p: Iterable of strings to write as-is (no separators are added).
            ``None`` is treated as an empty iterable, so the file is
            simply truncated.
        path: Destination file; defaults to ``C:\\pages.txt``.
    """
    if p is None:
        p = []
    # ``with`` guarantees the handle is closed even if a write fails.
    with open(path, 'w') as f:
        f.writelines(p)
if __name__ == '__main__':
    # Script entry point: scrape with a page count of 10 and write the
    # resulting lines to disk.
    ps = GetCnBlog(10)
    Save(ps)
http://files.cnblogs.com/files/kun2008/setuptools-0.6c11.win32-py2.7.zip
Python BeautifulSoup 抓取博客园首页精华
标签:
原文地址:http://www.cnblogs.com/kun2008/p/4394261.html