123

Code:

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas as ps
import sqlite3
# Get the click count of an article
def newsClick(url):
    id = re.findall(r'(\d{1,7})', url)[-1]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id)
    click = requests.get(clickUrl)
    newClick = int(click.text.split('.html')[-1].lstrip("('").rstrip("');"))  # parse the count out of the JS response
    # print('Clicks:')
    # print(newClick)
    return newClick
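# A minimal sketch of the string handling above, against an assumed sample payload.
# Judging from the parsing in newsClick, the count API seems to return a JavaScript
# snippet whose last statement ends in .html('<count>');. The payload below is only
# an assumption for illustration:
_sample = "$('#hits').html('306');"
_demo = int(_sample.split('.html')[-1].lstrip("('").rstrip("');"))   # -> 306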
def newsDateTime(newInfo):
    newDate = newInfo.split()[0].lstrip('发布时间:')
    newTime = newInfo.split()[1]
    newDateTime = newDate + ' ' + newTime
    newDT = datetime.strptime(newDateTime, '%Y-%m-%d %H:%M:%S')
    # print('Publish time:')
    # print(newDT)
    return newDT
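# A minimal sketch of newsDateTime against a hypothetical .show-info string.
# The field labels and their order are assumptions inferred from the splits used
# in newsInfo below (publish time, author, reviewer, source):
_info = '发布时间:2019-04-01 11:23:05 作者:张三 审核:李四 来源:校园网'
_dt = newsDateTime(_info)   # -> datetime(2019, 4, 1, 11, 23, 5)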
# Get the full details of one news article
def newsInfo(url):
    news = requests.get(url)
    news.encoding = 'utf-8'
    newSoup = BeautifulSoup(news.text, 'html.parser')

    # Title
    title = newSoup.select('.show-title')[0].text
    # print('Title: ' + title)

    # Publication info line
    newInfo = newSoup.select('.show-info')[0].text

    # Publish time
    newDT = newsDateTime(newInfo)

    # Author
    author = newInfo.split()[2]
    # print(author)

    # Reviewer
    examine = newInfo.split()[3]
    # print(examine)

    # Source
    source = newInfo.split()[4]
    # print(source)

    # Click count (the article URL itself carries the id used by the count API)
    newClick = newsClick(url)

    # Body text
    # newContent = newSoup.select('.show-content')[0].text
    # print('Content: ' + newContent)

    # Collect everything into a dictionary
    newsDetail = {}
    newsDetail['newsTitle'] = title
    newsDetail['newsDaTe'] = newDT
    newsDetail['newsAuthor'] = author
    newsDetail['newsExamine'] = examine
    newsDetail['newsSource'] = source
    newsDetail['newsClick'] = newClick

    return newsDetail
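# A usage sketch of newsInfo on a single article page. The URL below is hypothetical
# (it only mimics the kind of links collected from the list pages) and the call is
# left commented out so the script does not fetch it when run:
# detail = newsInfo('http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0404/11099.html')
# print(detail['newsTitle'], detail['newsClick'])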
def newsList(url):
    news = requests.get(url)
    news.encoding = 'utf-8'
    newSoup = BeautifulSoup(news.text, 'html.parser')
    newList = []
    li = newSoup.select('li')
    for new in li:
        if len(new.select('.news-list-text')) > 0:
            newUrl = new.select('a')[0]['href']
            # Summary shown on the list page
            newDescription = new.select('.news-list-description')[0].text
            newsDict = newsInfo(newUrl)
            # Add the summary to the article's dictionary
            newsDict['newsDescription'] = newDescription
            newList.append(newsDict)
    return newList
url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

# Get the total number of list pages (fetch the index page first so newSoup exists here)
res = requests.get(url)
res.encoding = 'utf-8'
newSoup = BeautifulSoup(res.text, 'html.parser')
page = int(newSoup.select('#pages')[0].text.split('..')[1].rstrip(' 下一页 '))

allNews = []
# Crawl the news on list pages 246-247
for i in range(246, 248):
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allNews.extend(newsList(listPageUrl))
newsTable = ps.DataFrame(allNews)
# newsTable.sort_values(by=['newsClick'], ascending=False)  # rank by click count; ascending=True sorts low to high
# newsTable.sort_values(by=['newsDaTe'], ascending=False)   # rank by publish time
newsTable.to_csv(r'F:\爬虫\news.csv')
with sqlite3.connect('news.sqlite') as db:
    newsTable.to_sql('news', db)
    s = ps.read_sql_query('SELECT * FROM news', con=db)
s
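The script computes `page` (the total number of list pages) but then only crawls pages 246-247. A minimal sketch of walking every list page with the same URL pattern; whether the first page is the bare index URL or 1.html is an assumption about the site, and `allNewsPages` is a hypothetical variable used here to avoid clobbering `allNews`:

allNewsPages = []
allNewsPages.extend(newsList(url))    # first list page, assumed to be the bare index URL
for i in range(2, page + 1):          # remaining pages follow the {n}.html pattern used above
    allNewsPages.extend(newsList('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)))

Crawling every page issues one request per article, so a full run is slow; the hard-coded 246-247 range in the original keeps the run short.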

 


Original source: https://www.cnblogs.com/bhuan/p/10669679.html
