标签:href rip com mat getc coding 字符串 技术分享 div
1. 用requests库和BeautifulSoup库,爬取校园新闻首页新闻的标题、链接、正文、show-info。
2. 分析info字符串,获取每篇新闻的发布时间,作者,来源,摄影等信息。
3. 将字符串格式的发布时间转换成datetime类型
4. 使用正则表达式取得新闻编号
5. 生成点击次数的Request URL
6. 获取点击次数
7. 将第4、5、6步定义成一个函数 def getClickCount(newsUrl):
8. 将获取新闻详情的代码定义成一个函数 def getNewDetail(newsUrl):
9. 尝试使用正则表达式分析show-info字符串和点击次数字符串。
# -*- coding: UTF-8 -*-
"""Scrape the first campus-news item from news.gzcc.cn and print its details.

For the first news entry on the list page this script prints the title, link,
raw show-info line, body text, publish time, author, source, photographer and
the click count obtained from the click-count API.
"""
import locale
import re
from datetime import datetime
from urllib.parse import urljoin

try:
    # 'chinese' is a Windows-only locale name; on other platforms setlocale
    # raises locale.Error. Nothing below depends on the locale, so keep going.
    locale.setlocale(locale.LC_CTYPE, 'chinese')
except locale.Error:
    pass

url = "http://news.gzcc.cn/html/xiaoyuanxinwen/"

# Click-count endpoint; the placeholder receives the numeric news id.
CLICK_URL_TEMPLATE = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def _fetchText(pageUrl):
    """Download *pageUrl* and return its body decoded as UTF-8."""
    # Imported lazily so the pure parsing helpers in this module stay usable
    # (and importable) without the scraping dependencies installed.
    import requests

    response = requests.get(pageUrl, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def _fetchSoup(pageUrl):
    """Download *pageUrl* and return it parsed as a BeautifulSoup tree."""
    from bs4 import BeautifulSoup

    return BeautifulSoup(_fetchText(pageUrl), 'html.parser')


def getNewsId(newsUrl):
    """Return the numeric news id from a news URL.

    The id is the file name of the page, e.g. ``'9183'`` for
    ``.../xiaoyuanxinwen_0404/9183.html``.

    Raises:
        ValueError: if *newsUrl* does not end in ``<digits>.html``.
    """
    match = re.search(r'(\d+)\.html(?:[?#].*)?$', newsUrl)
    if match is None:
        raise ValueError('no news id found in URL: {!r}'.format(newsUrl))
    return match.group(1)


def getClickUrl(newsUrl):
    """Return the click-count API URL for the news page *newsUrl*."""
    return CLICK_URL_TEMPLATE.format(getNewsId(newsUrl))


def parseClickCount(clickStr):
    """Return the hit count (int) from the click-count API response text.

    The response is a JavaScript snippet containing a fragment such as
    ``$('#hits').html('288');``.

    Raises:
        ValueError: if the hits fragment is not present in *clickStr*.
    """
    match = re.search(r"hits'\)\.html\('(\d+)'\);", clickStr)
    if match is None:
        raise ValueError('no hit count found in response: {!r}'.format(clickStr))
    return int(match.group(1))


def _infoField(info, label):
    """Return the text that directly follows ``label:`` in *info*.

    The value runs up to the next whitespace. An empty string is returned
    when the label is absent or has no value. Both the ASCII and the
    full-width colon are accepted after the label.
    """
    match = re.search(label + r'[::](\S*)', info)
    return match.group(1) if match else ''


def parseShowInfo(info):
    """Parse the show-info line of a news page.

    Returns a dict with keys ``'time'`` (a ``datetime``), ``'author'``,
    ``'source'`` and ``'photographer'`` (strings, empty when missing).

    Raises:
        ValueError: if no publish time of the form
            ``YYYY-MM-DD HH:MM:SS`` follows the publish-time label.
    """
    timeMatch = re.search(
        r'发布时间[::]\s*(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{2}:\d{2})', info)
    if timeMatch is None:
        raise ValueError('no publish time found in: {!r}'.format(info))
    return {
        'time': datetime.strptime(timeMatch.group(1), '%Y-%m-%d %H:%M:%S'),
        'author': _infoField(info, '作者'),
        'source': _infoField(info, '来源'),
        'photographer': _infoField(info, '摄影'),
    }


def getNewDetail(Url):
    """Print the details of the first news item listed on the page *Url*.

    Fetches the list page, follows the link of the first ``li`` that carries
    a ``.news-list-title`` element, prints the item's title, link, show-info
    line, body, publish time, author, source and photographer, and finally
    reports its click count via ``getClickCount``.
    """
    listSoup = _fetchSoup(Url)
    for news in listSoup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            continue
        title = titles[0].text
        # urljoin leaves absolute links untouched and resolves relative ones.
        link = urljoin(Url, news.select('a')[0].attrs['href'])

        detailSoup = _fetchSoup(link)
        content = detailSoup.select('#content')[0].text
        info = detailSoup.select('.show-info')[0].text

        print("标题:", title)
        print("链接:", link)
        print("展示:", info)
        print("正文:", content)

        fields = parseShowInfo(info)
        print("发布时间:", fields['time'])
        print("作者:", fields['author'])
        print("来源:", fields['source'])
        print("摄影:", fields['photographer'])

        getClickCount(link)
        # Only the first news item is reported.
        break


def getClickCount(newsUrl):
    """Print and return the click count of the news page *newsUrl*.

    Prints the news id, the click-count API URL and the count, then returns
    the count as an int.

    Raises:
        ValueError: if the id cannot be extracted from *newsUrl* or the API
            response does not contain a hit count.
    """
    newId = getNewsId(newsUrl)
    clickUrl = getClickUrl(newsUrl)
    count = parseClickCount(_fetchText(clickUrl))
    print("新闻编号:", newId)
    print("点击次数URL:", clickUrl)
    print("点击次数:", count)
    # Kept so the script's output lines stay the same as before; the value
    # comes from the same single response.
    print("点击次数1:", count)
    return count


if __name__ == '__main__':
    getNewDetail(url)
标签:href rip com mat getc coding 字符串 技术分享 div
原文地址:https://www.cnblogs.com/ashh/p/8763223.html