码迷,mamicode.com
首页 > 其他好文 > 详细

新练习

时间:2018-04-12 13:41:26      阅读:153      评论:0      收藏:0      [点我收藏+]

标签:tail   format   click   inf   split   .text   request   api   print   

import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def getClickCount(newsUrl):
    """Return the click count of a news article as an int.

    The article id is taken from ``newsUrl``: the text between the first
    ``_`` and the trailing ``.html`` is split on ``/`` and the second piece
    is used as the id. That id is sent to the ``oa.gzcc.cn`` counter API and
    the number inside the last ``.html('<digits>')`` call of the response
    text is returned.

    Args:
        newsUrl: URL of the article page; must contain ``_<x>/<id>.html``.

    Returns:
        The click count parsed from the counter API response.

    Raises:
        ValueError: if the id cannot be extracted from ``newsUrl`` or the
            API response does not contain a ``.html('<digits>')`` call.
    """
    # Raw string with an escaped dot so only a literal ".html" terminates
    # the captured part of the URL.
    match = re.search(r'_(.*)\.html', newsUrl)
    parts = match.group(1).split('/') if match else []
    if len(parts) < 2:
        raise ValueError('cannot extract news id from URL: {!r}'.format(newsUrl))
    newId = parts[1]

    clickUrl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newId)
    body = requests.get(clickUrl).text

    # Take the argument of the LAST .html('N') call; matching the digits
    # directly tolerates trailing whitespace after the final ");".
    counts = re.findall(r"\.html\('(\d+)'\)", body)
    if not counts:
        raise ValueError('unexpected click-count response: {!r}'.format(body))
    return int(counts[-1])

def getNewsDetail(newsUrl):
    """Fetch one article page and print its click count, URL, time and author.

    The page is downloaded and decoded as UTF-8, and the text of the first
    ``.show-info`` element is parsed for a ``YYYY-MM-DD HH:MM:SS`` timestamp
    and for the token following the ``作者:`` label. The click count comes
    from ``getClickCount(newsUrl)``.

    Args:
        newsUrl: URL of the article page.

    Returns:
        None. The four values are written to stdout with ``print``.

    Raises:
        IndexError: if the page has no ``.show-info`` element.
        ValueError: if no timestamp is found in the info text.
    """
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    info = soupd.select('.show-info')[0].text

    # Locate the timestamp by its shape instead of stripping a label:
    # str.lstrip() removes a *set of characters*, not a prefix, so it is
    # fragile against any change in the label or its punctuation.
    stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
    if stamp is None:
        raise ValueError('no publish time found in: {!r}'.format(info))
    dt = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')

    # Author is the first whitespace-delimited token after the label.
    # Slicing past the label (rather than lstrip) keeps names that start
    # with one of the label's own characters intact.
    marker = '作者:'
    au = ''
    start = info.find(marker)
    if start != -1:
        tokens = info[start + len(marker):].split()
        if tokens:
            au = tokens[0]

    clickCount = getClickCount(newsUrl)
    print(clickCount, newsUrl, dt, au)

def getNewsList(pageUrl):
    """Fetch a news listing page and print details of its first news item.

    The page is downloaded and decoded as UTF-8. Each ``<li>`` element is
    inspected; the first one containing a ``.news-list-title`` element is
    treated as a news entry, the ``href`` of its first ``<a>`` is passed to
    ``getNewsDetail``, and iteration stops.

    Args:
        pageUrl: URL of the listing page.

    Returns:
        None.
    """
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        # Only <li> elements that carry a news title are real entries.
        if not news.select('.news-list-title'):
            continue
        newsUrl = news.select('a')[0].attrs['href']
        getNewsDetail(newsUrl)
        # NOTE(review): stops after the first news item on the page. This
        # looks like a debugging limiter; remove it to process every item.
        break
# Base URL of the campus-news listing; it is also fetched as the first page.
pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen'
getNewsList(pageUrl)

# Walk the remaining listing pages, 2 through 233 inclusive.
for i in range(2, 234):
    # NOTE(review): assumes page N is served at <base>/N.html; confirm
    # against the site before relying on it.
    getNewsList('{}/{}.html'.format(pageUrl, i))


新练习

标签:tail   format   click   inf   split   .text   request   api   print   

原文地址:https://www.cnblogs.com/lg916843/p/8806683.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!