标签:list img size .sh coding href 包装 cat parser
1、用requests库和BeautifulSoup4库,爬取校园新闻列表的时间、标题、链接、来源、详细内容;将其中的时间str转换成datetime类型;将取得详细内容的代码包装成函数。
import requests from bs4 import BeautifulSoup from datetime import datetime gzccurl=‘http://news.gzcc.cn/html/xiaoyuanxinwen/‘ res=requests.get(gzccurl) res.encoding=‘utf-8‘ soup =BeautifulSoup(res.text,‘html.parser‘) def getdetail(url):#将取得详细内容的代码包装成函数 resn=requests.get(url) resn.encoding=‘utf-8‘ soupn=BeautifulSoup(resn.text,‘html.parser‘) detail=soupn.select(‘.show-content‘)[0].text return (detail) def getdetailtime(url):#将取得详细时间的代码包装成函数 rest=requests.get(url) rest.encoding=‘utf-8‘ soupt=BeautifulSoup(rest.text,‘html.parser‘) detailtime=soupt.select(‘.show-info‘)[0].text[16:24] return(detailtime) for news in soup.select(‘li‘): if len((news.select(‘.news-list-title‘)))>0: time=news.select(‘.news-list-info‘)[0].span.text#时间 title=news.select(‘.news-list-title‘)[0].contents[0]#标题 url=news.select(‘a‘)[0][‘href‘]#链接 source=news.select(‘.news-list-info‘)[0].select(‘span‘)[1].text detail=getdetail(url) dtime=getdetailtime(url) addt=time+‘-‘+dtime dt=datetime.strptime(addt,‘%Y-%m-%d-%H:%M:%S‘)#将其中的时间str转换成datetime类型
2、爬取AOPA网站上的内容
import requests from bs4 import BeautifulSoup from datetime import datetime aopaurl=‘http://www.aopa.org.cn/news/list.php?catid=8‘ res=requests.get(aopaurl) res.encoding=‘utf-8‘ soup =BeautifulSoup(res.text,‘html.parser‘) def getdetail(url):#将取得详细内容的代码包装成函数 resn=requests.get(url) resn.encoding=‘utf-8‘ soupn=BeautifulSoup(resn.text,‘html.parser‘) detail=soupn.select(‘.content‘)[0].text return (detail) for news in soup.select(‘.catlist‘): #print(news) for news2 in news.select(‘.catlist_li‘): if len((news2))>0: time=news2.select(‘span‘)[0].text #时间 title=news2.select(‘a‘)[0][‘title‘]#标题 url=news2.select(‘a‘)[0][‘href‘]#链接 dt=datetime.strptime(time,‘%Y-%m-%d %H:%M‘)#将其中的时间str转换成datetime类型 detail=getdetail(url) print( "时间:",dt,"\t标题:",title,"\t链接:",url,"详情:",detail)
用requests库和BeautifulSoup4库爬取新闻列表
标签:list img size .sh coding href 包装 cat parser
原文地址:http://www.cnblogs.com/gzw2017/p/7608174.html