标签:list split http 数据 parser 自己 php .com findall
1.取出一个新闻列表页的全部新闻 包装成函数。
2.获取总的新闻篇数,算出新闻总页数。
3.获取全部新闻列表页的全部新闻详情。
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Counter endpoint queried by getClickCount; "{}" receives the article id.
CLICK_API = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'


# Get the click count of a news article.
def getClickCount(newsUrl):
    """Return the click count of the article at *newsUrl* as a string.

    The article id is taken from the URL: the text between the first "_" and
    the last ".html" is split on "/" and its second piece is used as the id.
    The id is sent to CLICK_API and the count is read from the
    ``hits').html('<count>');`` fragment of the response text.

    Raises:
        IndexError: if the URL does not contain the expected "_.../id.html" part.
        AttributeError: if the response does not contain the hits fragment.
    """
    # Raw string with an escaped dot so only a literal ".html" ends the capture.
    newsId = re.findall(r'_(.*)\.html', newsUrl)[0].split('/')[1]
    clickStr = requests.get(CLICK_API.format(newsId)).text
    return re.search(r"hits'\)\.html\('(.*)'\);", clickStr).group(1)


def _field_after(info, label):
    """Return the text glued to *label* in *info* (up to whitespace), or 'none'.

    The label is removed as an exact prefix; str.lstrip would strip a
    character set and could also eat leading characters of the value.
    """
    start = info.find(label)
    if start < 0:
        return 'none'
    # Non-empty: the slice starts with the (non-whitespace) label itself.
    token = info[start:].split()[0]
    return token[len(label):]


# Get the details of one news article.
def getNewDetail(url):
    """Fetch the article at *url* and print its link, title, time, source, author and clicks.

    The title is read from the first `.show-title` element and the metadata
    from the first `.show-info` element. Nothing is returned.

    Raises:
        IndexError: if the page has no `.show-title` / `.show-info` element.
        ValueError: if no "YYYY-mm-dd HH:MM:SS" timestamp is found in the info text.
    """
    resd = requests.get(url)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    title = soupd.select('.show-title')[0].text
    info = soupd.select('.show-info')[0].text

    # Locate the timestamp by shape instead of assuming it sits right after
    # the "发布时间:" label at the very start of the text.
    stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
    if stamp is None:
        raise ValueError('no publish time found in: {!r}'.format(info))
    dt = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')

    source = _field_after(info, '来源:')
    author = _field_after(info, '作者:')
    clickcount = getClickCount(url)

    print('链接:{0}\n标题:{1}\n发布时间:{2}\n来源:{3}\n作者:{4}\n点击次数:{5}'.format(
        url, title, dt, source, author, clickcount))
    print('-----------------')


def _news_links(soup):
    """Return the first-anchor href of every <li> that contains a `.news-list-title` element."""
    return [item.a.attrs['href']
            for item in soup.select('li')
            if item.select('.news-list-title')]


def getListPage(listPageUrl):
    """Fetch one news list page and print the details of every article linked from it."""
    res = requests.get(listPageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for link in _news_links(soup):
        # Fetch and print the article behind this list entry.
        getNewDetail(link)


url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
resn = requests.get(url)
resn.encoding = 'utf-8'
soupn = BeautifulSoup(resn.text, 'html.parser')

# The first `.a1` element's text is a number followed by "条"; the surrounding
# notes describe it as the total number of news articles.
n = int(soupn.select('.a1')[0].text.rstrip('条'))

# The index page was already downloaded above, so its articles are handled
# directly from that parse.
firstPageLinks = _news_links(soupn)
for link in firstPageLinks:
    getNewDetail(link)

# n is an article count, not a page number: derive the page count from the
# number of articles shown on the first page (ceiling division).
perPage = len(firstPageLinks)
pageCount = -(-n // perPage) if perPage else 0

# NOTE(review): assumes the index URL is page 1 and later pages are
# "<n>.html" numbered from 2 - confirm against the site.
for i in range(2, pageCount + 1):
    pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(pageUrl)
4.找一个自己感兴趣的主题,进行数据爬取,并进行分词分析。不能与其它同学雷同。
import re
from collections import Counter
from datetime import datetime

import jieba
import requests
from bs4 import BeautifulSoup

# Characters blanked out before word segmentation.
# NOTE(review): the leading "一" is the CJK numeral one, possibly a stand-in
# for a dash - confirm it is really meant to be removed.
_PUNCTUATION = '一!“”,。?;’"\',.、:\n'
_PUNCT_TO_SPACE = str.maketrans(_PUNCTUATION, ' ' * len(_PUNCTUATION))

# Tokens that are never reported as keywords.
_EXCLUDE = frozenset({
    '这', '\u3000', '\r', '\xa0', '时候', '对', '上', '与', '等', '不', '',
    '没有', '很多', '的', '大', '出来', '_', '到', ' ', '将', '在', '是', '了',
    '一', '还', '也', '《', '》', '(', ')', '和', '我', '我们', '其', '能够',
    '以', '个', '短', '中', '不是',
})


def getNewsDetail(newsUrl):
    """Fetch the article at *newsUrl* and print its time, source, top keywords and body.

    The body is read from the first `#endText` element and the metadata from
    the first `.post_time_source` element. Nothing is returned.

    Raises:
        IndexError: if either element is missing from the page.
        AttributeError: if the timestamp or the "来源:" label is not found.
        ValueError: if the timestamp does not use "-" and ":" separators.
    """
    resd = requests.get(newsUrl)
    # GBK is a superset of GB2312, so this decodes the same pages and matches
    # the encoding used for the list pages.
    resd.encoding = 'gbk'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text

    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    sources = re.search(r'来源:\s*(.*)', info).group(1)
    TopWords = getTopWords(content)

    print('发布时间:{0}\n来源:{1}'.format(dateTime, sources))
    # Up to five (word, count) pairs; short articles may yield fewer than five.
    print('关键词:' + '、'.join('{}'.format(pair) for pair in TopWords[:5]))
    print(content)
    print('---------------------------')


def getTopWords(content):
    """Return ``[(word, count), ...]`` for *content*, most frequent first.

    Punctuation listed in _PUNCTUATION is replaced by spaces, the text is
    segmented with jieba, and tokens in _EXCLUDE are dropped. Words with equal
    counts keep their order of first appearance.
    """
    cleaned = content.translate(_PUNCT_TO_SPACE)
    # A single counting pass instead of calling list.count once per distinct word.
    counts = Counter(word for word in jieba.cut(cleaned) if word not in _EXCLUDE)
    return counts.most_common()


def getListPage(listUrl):
    """Fetch the list page at *listUrl* and print the first listed article.

    Only the first <li> under `#news-flow-content` is processed: the original
    loop stopped after one iteration, and that behaviour is kept.

    Raises:
        IndexError: if `#news-flow-content` is missing or the item has no link.
    """
    res = requests.get(listUrl)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.select('#news-flow-content')[0].select('li')
    for new in items[:1]:
        anchor = new.select('a')[0]
        url = anchor['href']
        title = anchor.text
        print('标题:{0}\n链接:{1}'.format(title, url))
        getNewsDetail(url)


listUrl = 'http://tech.163.com/telecom/'
getListPage(listUrl)
for i in range(2, 10):
    listUrl = 'http://tech.163.com/special/it_2016_%02d/' % i
    getListPage(listUrl)
标签:list split http 数据 parser 自己 php .com findall
原文地址:https://www.cnblogs.com/abcdcd/p/8799052.html