4-15 爬取新浪网

时间：2018-04-15 19:49:53 阅读：176 评论：0 收藏：0 [点我收藏+]

标签：new group datetime loader nts 编辑 ram .sh append

import json
import re
from datetime import datetime

import pandas
import requests
from bs4 import BeautifulSoup
 8 
 9 def getNewsdetial(newsurl):
10     res = requests.get(newsurl)
11     res.encoding = ‘utf-8‘
12     soup = BeautifulSoup(res.text,‘html.parser‘)
13     newsTitle = soup.select(‘.page-header h1‘)[0].text.strip()
14     nt = datetime.strptime(soup.select(‘.time-source‘)[0].contents[0].strip(),‘%Y年%m月%d日%H:%M‘)
15     newsTime = datetime.strftime(nt,‘%Y-%m-%d %H:%M‘)
16     newsArticle = getnewsArticle(soup.select(‘.article p‘))
17     newsAuthor = newsArticle[-1]
18     return newsTitle,newsTime,newsArticle,newsAuthor
def getnewsArticle(news):
    """Return the stripped text of every paragraph element in *news*.

    Args:
        news: iterable of elements exposing a ``.text`` string attribute
            (here, the ``<p>`` tags selected from the article body).

    Returns:
        A list with one whitespace-stripped string per element, in order.
    """
    return [p.text.strip() for p in news]
24 
def getCommentCount(newsurl):
    """Return the total number of comments for a news article.

    The article id is taken from the ``doc-i<id>.shtml`` part of *newsurl*
    and substituted into the comment-API URL template.

    Args:
        newsurl: URL of the article page.

    Returns:
        The value stored under ``result -> count -> total`` in the API
        response.

    Raises:
        ValueError: if *newsurl* contains no ``doc-i<id>.shtml`` segment,
            or if the response body is not valid JSON after unwrapping.
        requests.RequestException: on network failure or timeout.
        KeyError: if the response lacks the expected keys.
    """
    # Raw string with an escaped dot so ".shtml" is matched literally.
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    if m is None:
        raise ValueError('cannot extract news id from URL: {}'.format(newsurl))
    newsid = m.group(1)

    # The {} placeholder is filled with the news id via str.format.
    commenturl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
    comment = requests.get(commenturl.format(newsid), timeout=10)

    # The body is JavaScript of the form "var data={...}". Remove that
    # assignment as a prefix; str.lstrip would treat the argument as a set
    # of characters and could eat into the payload.
    payload = re.sub(r'^\s*var\s+data\s*=\s*', '', comment.text, count=1)
    jd = json.loads(payload)
    return jd['result']['count']['total']
34 
35 
def getNewsLinkUrl(max_page=9):
    """Collect article URLs from the asynchronously loaded news list.

    Requests pages ``1 .. max_page`` of the roll-news list API, unwraps the
    JSONP response of each page and gathers the article URLs it contains.

    Args:
        max_page: number of the last list page to fetch (inclusive).
            Defaults to 9, i.e. pages 1 through 9.

    Returns:
        A flat list of article URLs from all fetched pages, in page order.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError: if a response body is not valid JSON after unwrapping.
    """
    urlFormat = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1501000415111'
    callback_open = 'newsloadercallback('
    url = []
    for i in range(1, max_page + 1):
        res = requests.get(urlFormat.format(i), timeout=10)

        # Unwrap "newsloadercallback({...});" by removing the exact prefix
        # and suffix. str.lstrip/rstrip take a character set, not a string,
        # so they are the wrong tool for this.
        payload = res.text.strip()
        if payload.startswith(callback_open):
            payload = payload[len(callback_open):]
        if payload.endswith(';'):
            payload = payload[:-1]
        if payload.endswith(')'):
            payload = payload[:-1]

        jd = json.loads(payload)
        # extend (not append) keeps the result a flat list of URLs.
        url.extend(getUrl(jd))
    return url
45 
def getUrl(jd):
    """Return the article URLs contained in one decoded list page.

    Args:
        jd: decoded JSON mapping whose ``jd['result']['data']`` is an
            iterable of mappings, each with a ``'url'`` key.

    Returns:
        A list of the ``'url'`` values, in the order they appear.

    Raises:
        KeyError: if ``'result'``, ``'data'`` or an item's ``'url'`` key
            is missing.
    """
    return [item['url'] for item in jd['result']['data']]
52 
def getNewsDetial():
    """Gather title, body, comment count, time and editor for every article.

    Walks every URL returned by ``getNewsLinkUrl()`` and collects the
    fields into parallel lists.

    Returns:
        A dict of equal-length lists keyed ``'a_title'``, ``'b_article'``,
        ``'c_commentCount'``, ``'d_time'`` and ``'e_editor'``; index *k* of
        each list describes the same article.
    """
    title_all = []
    author_all = []
    commentCount_all = []
    article_all = []
    time_all = []
    for url in getNewsLinkUrl():
        # Fetch and parse each page exactly once, then unpack the tuple.
        # Calling getNewsdetial per field would download the same page
        # four times.
        title, news_time, article, author = getNewsdetial(url)
        comment_count = getCommentCount(url)

        title_all.append(title)
        time_all.append(news_time)
        article_all.append(article)
        author_all.append(author)
        commentCount_all.append(comment_count)
    total_2 = {
        'a_title': title_all,
        'b_article': article_all,
        'c_commentCount': commentCount_all,
        'd_time': time_all,
        'e_editor': author_all,
    }
    return total_2
69 
# Entry point: scrape all articles, load the result into a pandas DataFrame
# and write it out as an Excel workbook. Runs at module level.

df = pandas.DataFrame(getNewsDetial())
df.to_excel('news2.xlsx')

4-15 爬取新浪网

标签：new group datetime loader nts 编辑 ram .sh append

原文地址：https://www.cnblogs.com/coder-2017/p/8848951.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行