标签:click page api 点击 次数 script color php .sql
代码:
# Scraper for campus news at news.gzcc.cn.
# For each article on list pages 246-247 it collects title, publish time,
# author, reviewer, source, click count and description, then writes the
# result to a CSV file and a SQLite database.
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas as ps
import sqlite3


# Get the click count of one article.
def newsClick(url):
    """Return the click count for an article, fetched from the count API.

    The article id is the last run of 1-7 digits in the article URL.
    """
    id = re.findall(r'(\d{1,7})', url)[-1]  # raw string: avoid invalid-escape warning
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id)
    click = requests.get(clickUrl)
    # The API answers with a JS snippet like "....html('123');" — peel off the
    # wrapper and keep the number.
    newClick = int(click.text.split('.html')[-1].lstrip("('").rstrip("');"))
    return newClick


def newsDateTime(newInfo):
    """Parse the publish datetime out of the '.show-info' text.

    Expects the first whitespace-separated token to be '发布时间:YYYY-MM-DD'
    and the second to be 'HH:MM:SS'.
    """
    newDate = newInfo.split()[0].lstrip('发布时间:')
    newTime = newInfo.split()[1]
    newDateTime = newDate + ' ' + newTime
    newDT = datetime.strptime(newDateTime, '%Y-%m-%d %H:%M:%S')
    return newDT


# Get all details of one news article.
def newsInfo(url):
    """Fetch an article page and return its details as a dict."""
    news = requests.get(url)
    news.encoding = 'utf-8'
    newSoup = BeautifulSoup(news.text, 'html.parser')

    # Title
    title = newSoup.select('.show-title')[0].text
    # Publish-info line (time / author / reviewer / source)
    newInfo = newSoup.select('.show-info')[0].text
    newDT = newsDateTime(newInfo)
    author = newInfo.split()[2]   # 作者
    examine = newInfo.split()[3]  # 审核
    source = newInfo.split()[4]   # 来源

    # BUG FIX: original called newsClick(newUrl), relying on a variable leaked
    # from newsList()'s scope; use the article url passed to this function.
    newClick = newsClick(url)

    newsDetail = {}
    newsDetail['newsTitle'] = title
    newsDetail['newsDaTe'] = newDT  # key spelling kept as-is for compatibility
    newsDetail['newsAuthor'] = author
    newsDetail['newsExamine'] = examine
    newsDetail['newsSource'] = source
    newsDetail['newsClick'] = newClick
    return newsDetail


def newsList(url):
    """Fetch one list page and return a list of detail dicts for its articles."""
    news = requests.get(url)
    news.encoding = 'utf-8'
    newSoup = BeautifulSoup(news.text, 'html.parser')
    newList = []
    for new in newSoup.select('li'):
        # Only <li> elements that carry a news entry have '.news-list-text'.
        if len(new.select('.news-list-text')) > 0:
            newUrl = new.select('a')[0]['href']
            newDescription = new.select('.news-list-description')[0].text
            newsDict = newsInfo(newUrl)
            newsDict['newsDescription'] = newDescription
            newList.append(newsDict)
    return newList


def main():
    url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

    # BUG FIX: the original read `newSoup` at module level before any such
    # variable existed (NameError); fetch the front page here to determine
    # the total number of list pages.
    front = requests.get(url)
    front.encoding = 'utf-8'
    frontSoup = BeautifulSoup(front.text, 'html.parser')
    page = int(frontSoup.select('#pages')[0].text.split('..')[1].rstrip(' 下一页 '))

    allNews = []
    # Scrape list pages in range(246, 248).
    for i in range(246, 248):
        listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        # BUG FIX: original passed `url` (the front page) on every iteration,
        # scraping the same page twice instead of pages 246 and 247.
        allNews.extend(newsList(listPageUrl))

    newsTable = ps.DataFrame(allNews)
    # newsTable.sort_values(by=['newsClick'], ascending=False)  # rank by clicks
    # newsTable.sort_values(by=['newsDaTe'], ascending=False)   # rank by publish time
    newsTable.to_csv(r'F:\爬虫\news.csv')
    with sqlite3.connect('news.sqlite') as db:
        newsTable.to_sql('news', db)
        # BUG FIX: original used undefined name `pandas`; module is imported as `ps`.
        s = ps.read_sql_query('SELECT * FROM news', con=db)
    return s


if __name__ == '__main__':
    main()
标签:click page api 点击 次数 script color php .sql
原文地址:https://www.cnblogs.com/bhuan/p/10669679.html