123

Code:

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas as ps
import sqlite3
# Get the click count of an article
def newsClick(url):
    id = re.findall(r'(\d{1,7})', url)[-1]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id)
    click = requests.get(clickUrl)
    newClick = int(click.text.split('.html')[-1].lstrip("('").rstrip("');"))  # parse the count out of the JS response
    # print('Clicks:')
    # print(newClick)
    return newClick
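# A minimal sketch of the string handling above, against an assumed sample payload.
# Judging from the parsing in newsClick, the count API seems to return a JavaScript
# snippet whose last statement ends in .html('<count>');. The payload below is only
# an assumption for illustration:
_sample = "$('#hits').html('306');"
_demo = int(_sample.split('.html')[-1].lstrip("('").rstrip("');"))   # -> 306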
def newsDateTime(newInfo):
    newDate = newInfo.split()[0].lstrip('发布时间:')
    newTime = newInfo.split()[1]
    newDateTime = newDate + ' ' + newTime
    newDT = datetime.strptime(newDateTime, '%Y-%m-%d %H:%M:%S')
    # print('Publish time:')
    # print(newDT)
    return newDT
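# A minimal sketch of newsDateTime against a hypothetical .show-info string.
# The field labels and their order are assumptions inferred from the splits used
# in newsInfo below (publish time, author, reviewer, source):
_info = '发布时间:2019-04-01 11:23:05 作者:张三 审核:李四 来源:校园网'
_dt = newsDateTime(_info)   # -> datetime(2019, 4, 1, 11, 23, 5)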
# Get the full details of one news article
def newsInfo(url):
    news = requests.get(url)
    news.encoding = 'utf-8'
    newSoup = BeautifulSoup(news.text, 'html.parser')

    # Title
    title = newSoup.select('.show-title')[0].text
    # print('Title: ' + title)

    # Publication info line
    newInfo = newSoup.select('.show-info')[0].text

    # Publish time
    newDT = newsDateTime(newInfo)

    # Author
    author = newInfo.split()[2]
    # print(author)

    # Reviewer
    examine = newInfo.split()[3]
    # print(examine)

    # Source
    source = newInfo.split()[4]
    # print(source)

    # Click count (the article URL itself carries the id used by the count API)
    newClick = newsClick(url)

    # Body text
    # newContent = newSoup.select('.show-content')[0].text
    # print('Content: ' + newContent)

    # Collect everything into a dictionary
    newsDetail = {}
    newsDetail['newsTitle'] = title
    newsDetail['newsDaTe'] = newDT
    newsDetail['newsAuthor'] = author
    newsDetail['newsExamine'] = examine
    newsDetail['newsSource'] = source
    newsDetail['newsClick'] = newClick

    return newsDetail
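# A usage sketch of newsInfo on a single article page. The URL below is hypothetical
# (it only mimics the kind of links collected from the list pages) and the call is
# left commented out so the script does not fetch it when run:
# detail = newsInfo('http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0404/11099.html')
# print(detail['newsTitle'], detail['newsClick'])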
def newsList(url):
    news = requests.get(url)
    news.encoding = 'utf-8'
    newSoup = BeautifulSoup(news.text, 'html.parser')
    newList = []
    li = newSoup.select('li')
    for new in li:
        if len(new.select('.news-list-text')) > 0:
            newUrl = new.select('a')[0]['href']
            # Summary shown on the list page
            newDescription = new.select('.news-list-description')[0].text
            newsDict = newsInfo(newUrl)
            # Add the summary to the article's dictionary
            newsDict['newsDescription'] = newDescription
            newList.append(newsDict)
    return newList
url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

# Get the total number of list pages (fetch the index page first so newSoup exists here)
res = requests.get(url)
res.encoding = 'utf-8'
newSoup = BeautifulSoup(res.text, 'html.parser')
page = int(newSoup.select('#pages')[0].text.split('..')[1].rstrip(' 下一页 '))

allNews = []
# Crawl the news on list pages 246-247
for i in range(246, 248):
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allNews.extend(newsList(listPageUrl))
newsTable = ps.DataFrame(allNews)
# newsTable.sort_values(by=['newsClick'], ascending=False)  # rank by click count; ascending=True sorts low to high
# newsTable.sort_values(by=['newsDaTe'], ascending=False)   # rank by publish time
newsTable.to_csv(r'F:\爬虫\news.csv')
with sqlite3.connect('news.sqlite') as db:
    newsTable.to_sql('news', db)
    s = ps.read_sql_query('SELECT * FROM news', con=db)
s
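The script computes `page` (the total number of list pages) but then only crawls pages 246-247. A minimal sketch of walking every list page with the same URL pattern; whether the first page is the bare index URL or 1.html is an assumption about the site, and `allNewsPages` is a hypothetical variable used here to avoid clobbering `allNews`:

allNewsPages = []
allNewsPages.extend(newsList(url))    # first list page, assumed to be the bare index URL
for i in range(2, page + 1):          # remaining pages follow the {n}.html pattern used above
    allNewsPages.extend(newsList('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)))

Crawling every page issues one request per article, so a full run is slow; the hard-coded 246-247 range in the original keeps the run short.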

 


Original source: https://www.cnblogs.com/bhuan/p/10669679.html
