标签:odi awl utf-8 lease one ptime get 爬取 coding
爬取中山大学官网的新闻页面:
import re
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Base URL of the news section; relative links found on list pages are
# resolved against it, and numbered list pages are built from it.
NEWS_BASE_URL = 'http://news2.sysu.edu.cn/news01/'

# Seconds to wait for the server before a request is abandoned, so that a
# stalled connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download *url* and return the page parsed with ``html.parser``.

    The response body is always decoded as UTF-8, whatever encoding the
    server declares.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the server does not answer within ``REQUEST_TIMEOUT``.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.encoding = 'UTF-8'
    return BeautifulSoup(res.text, 'html.parser')


def crawlOnePage(url):
    """Print the link and title of each news item on one list page.

    Every ``<li>`` element of the page is inspected and its first ``<a>``
    is taken as the news link. Items without a link (or without an
    ``href``) are skipped instead of aborting the whole page.

    Args:
        url: Address of the news list page to crawl.
    """
    soup = _fetch_soup(url)
    for item in soup.select('li'):
        anchor = item.select_one('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        print()
        # urljoin handles relative hrefs exactly like plain concatenation
        # but also copes with absolute or "../" style links.
        print('新闻网址: ' + urljoin(NEWS_BASE_URL, href))
        print('新闻标题: ' + anchor.text)


def getDetail(url):
    """Print the metadata and body text of a single news article.

    The third ``<p>`` of the page is split on ``|``; fields 0, 1, 3 and 4
    are read as source, author, editor and release date respectively
    (field 2 is not used). The last ``<p>`` of the page is printed as the
    article content.

    Args:
        url: Address of the article page.

    Raises:
        IndexError: if the page has fewer than three ``<p>`` elements or
            the metadata paragraph has fewer than five ``|`` fields.
        ValueError: if the date text is not in ``YYYY-MM-DD`` form.
    """
    soup = _fetch_soup(url)
    paragraphs = soup.select('p')
    fields = paragraphs[2].text.split('|')

    source = fields[0]
    author = fields[1]
    editor = fields[3]

    # The date field looks like "<label>:<date>". Accept the full-width
    # colon as well as the ASCII one.
    # NOTE(review): which colon the live page uses is not visible here.
    date_text = re.split('[:：]', fields[4])[1]

    # Strip surrounding whitespace so parsing works whether or not the page
    # leaves trailing blanks after the date.
    release_time = datetime.strptime(date_text.strip(), '%Y-%m-%d')

    print(source, author, editor, release_time)
    print(paragraphs[-1].text)


def getTotalPage(url):
    """Crawl the numbered list pages announced by the pager on *url*.

    The first ``<strong>`` element is expected to hold text of the form
    ``"<current>/<total>"``. With total ``n``, the pages ``index1.htm``
    through ``index{n-1}.htm`` are crawled; ``index.htm`` itself is not
    crawled by this function.

    Args:
        url: Address of a list page whose pager shows the page count.

    Raises:
        IndexError: if the page contains no ``<strong>`` element.
        ValueError: if the text after the last ``/`` is not an integer.
    """
    soup = _fetch_soup(url)
    pager_text = soup.select('strong')[0].text
    # Take what follows the last "/". str.lstrip('1/') would strip a *set*
    # of characters and turn "1/115" into "5".
    n = int(pager_text.rpartition('/')[2])
    for i in range(1, n):
        crawlOnePage(f'{NEWS_BASE_URL}index{i}.htm')


def main():
    """Run the demo crawl: first list page, one article, then all pages."""
    crawlOnePage('http://news2.sysu.edu.cn/news01/index.htm')
    getDetail('http://news2.sysu.edu.cn/news01/152940.htm')
    getTotalPage('http://news2.sysu.edu.cn/news01/index.htm')


if __name__ == '__main__':
    main()
截图:
标签:odi awl utf-8 lease one ptime get 爬取 coding
原文地址:https://www.cnblogs.com/a565810497/p/8797018.html