标签:来源 odi time 技术分享 inf 技术 分享 print page
import requests import re from bs4 import BeautifulSoup url=‘http://news.gzcc.cn/html/xiaoyuanxinwen/‘ res=requests.get(url) res.encoding=‘utf-8‘ soup=BeautifulSoup(res.text,‘html.parser‘) li=soup.select(‘li‘) def get(gzcc): dj=re.search(‘_.*/(.*).html‘,gzcc).groups(0)[0] djcs=int(requests.get(‘http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80‘.format(dj)).text.split(‘.‘)[-1].lstrip("html(‘").rstrip("‘);")) return djcs def sss(label): for news in label: if len(news.
select(‘.news-list-title‘))>0: title=news.select(‘.news-list-title‘)[0].text #标题 time=news.select(‘.news-list-info‘)[0].contents[0].text#时间 url1=news.select(‘a‘)[0][‘href‘]#url bumen=news.select(‘.news-list-info‘)[0].contents[1].text#部门 description=news.select(‘.news-list-description‘)[0].text #描述 cs=get(url1) print(time,title,url,cs) sss(li) pages=int(soup.select(‘.a1‘)[0].text.rstrip(‘条‘))//10+1 for list in range(2,pages+1): pageurl="http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(list) pageres=requests.get(pageurl) pageres.encoding=‘utf-8‘ pagesoup=BeautifulSoup(pageres.text,‘html.parser‘) pagelist=pagesoup.select(‘li‘) sss(pagelist) break
import requests import re from bs4 import BeautifulSoup url=‘http://news.szu.edu.cn/xyxw/sdyw.htm‘ res=requests.get(url) res.encoding=‘utf-8‘ soup=BeautifulSoup(res.text,‘html.parser‘) li=soup.select(‘li‘) def get(shenda): dj=re.search(‘_.*/(.*).htm‘,shenda).groups(0)[0] djcs=int(requests.get(‘http://news.szu.edu.cn/info/1003/{}.htm‘.format(dj)).text.split(‘.‘)[-1].lstrip("html(‘").rstrip("‘);")) return djcs def sd(label): for news in label: if len(news.select(‘._blank‘))>0: title=news.select(‘._blank‘)[0].text time=news.select(‘._blank‘).contents[0].text url1=news.select(‘a‘) cs=get(url1) print(time,title,url,cs) sd(li) pages=int(soup.select(‘.left‘)[0].text.rstrip(‘条‘))//10+1 for list in range(2,pages+1): pageurl="http://news.szu.edu.cn/xyxw/sdyw/{}.htm".format(list) pageres=requests.get(pageurl) pageres.encoding=‘utf-8‘ pagesoup=BeautifulSoup(pageres.text,‘html.parser‘) pagelist=pagesoup.select(‘li‘) sd(pagelist) break
标签:来源 odi time 技术分享 inf 技术 分享 print page
原文地址:http://www.cnblogs.com/bb437601841/p/7649549.html