标签:链接 des nic error iter 定义函数 女性 BLB write
# -*- coding: utf-8 -*-
"""Weibo trending-topic and comment crawler; writes results to weibo.csv."""
# Standard library
import csv
import importlib
import os
import re
import sys
import time

# Third party
import requests
from fake_useragent import UserAgent

# Python 2 holdover (from the sys.setdefaultencoding era); effectively a
# no-op on Python 3 but kept to preserve the module's original side effects.
importlib.reload(sys)
class WeiBoSpider():
    """Crawl Weibo trending topics and their hot comments into a CSV file.

    One row is written per topic (columns 1-10 filled) and one row per
    comment (columns 11-16 filled); the remaining columns hold " " padding
    so every row has the same 16-column shape.
    """

    # Month abbreviations the original script recognizes (epidemic period).
    _MONTHS = {'Jan': '01', 'Feb': '02', 'Mar': '03'}

    def __init__(self, page):
        """Open the output CSV, write the header row, and set up HTTP headers.

        :param page: number of trend-listing pages to scan (range(1, page)).
        """
        self.path = os.path.join(os.getcwd(), "weibo.csv")
        # utf-8-sig so Excel on Windows detects the encoding via the BOM.
        self.csvfile = open(self.path, "a", newline="", encoding="utf-8-sig")
        self.writer = csv.writer(self.csvfile)
        # CSV header row (column names kept exactly as the original output format).
        self.writer.writerow(('话题链接', '话题内容', '楼主ID', '楼主昵称', '楼主性别', '发布日期',
                              '发布时间', '转发量', '评论量', '点赞量', '评论者ID', '评论者昵称',
                              '评论者性别', '评论日期', '评论时间', '评论内容'))
        # NOTE(review): the Cookie below is a captured session and will expire;
        # requests fail silently once it does — refresh before running.
        self.headers = {
            'Cookie': '_T_WM=22822641575; H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2F; ALF=1584226439; MLOGIN=1; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5RJaVYrb.BEuOvUQ8Ca2OO5JpX5K-hUgL.FoqESh-7eKzpShM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMceoBfeh2EeKBN; SCF=AnRSOFp6QbWzfH1BqL4HB8my8eWNC5C33KhDq4Ko43RUIzs6rjJC49kIvz5_RcOJV2pVAQKvK2UbAd1Uh6j0pyo.; SUB=_2A25zQaQBDeRhGeBM71cR8SzNzzuIHXVQzcxJrDV6PUJbktAKLXD-kW1NRPYJXhsrLRnku_WvhsXi81eY0FM2oTtt; SUHB=0mxU9Kb_Ce6s6S; SSOLoginState=1581634641; WEIBOCN_FROM=1110106030; XSRF-TOKEN=dc7c27; M_WEIBOCN_PARAMS=oid%3D4471980021481431%26luicode%3D20000061%26lfid%3D4471980021481431%26uicode%3D20000061%26fid%3D4471980021481431',
            'Referer': 'https://m.weibo.cn/detail/4312409864846621',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.comments_ID = []
        self.page = page

    def get_title_id(self):
        """Collect status IDs from the trend-top feed into self.comments_ID."""
        for page in range(1, self.page):
            # Rotate the User-Agent per request to reduce blocking.
            # NOTE(review): this REPLACES self.headers, dropping the Cookie for
            # all later requests — kept as in the original script, but verify
            # the detail/comment endpoints still respond without it.
            self.headers = {"User-Agent": UserAgent().chrome}
            time.sleep(1)  # be polite to the API
            api_url = ('https://m.weibo.cn/api/feed/trendtop'
                       '?containerid=102803_ctg1_600059_-_ctg1_600059&page=' + str(page))
            rep = requests.get(url=api_url, headers=self.headers)
            # One id per status on this listing page.
            for status in rep.json()['data']['statuses']:
                self.comments_ID.append(status['id'])

    def _format_date(self, parts):
        """Return 'YYYY/MM/DD' from a split created_at, or None if the month
        is outside the Jan-Mar window the script handles.

        Assumes Weibo's English date format, e.g.
        ['Sat', 'Feb', '15', '10:20:30', '+0800', '2020'] — TODO confirm.
        """
        for token in parts:
            if token in self._MONTHS:
                return "{}/{}/{}".format(parts[-1], self._MONTHS[token], parts[2])
        return None

    def spider_title(self, id):
        """Scrape one topic's detail page and write its CSV row.

        :param id: numeric status id of the topic.
        :return: number of 20-comment ajax pages, or None when the page could
                 not be parsed (topic is then skipped by main()).
        """
        title_url = 'https://m.weibo.cn/detail/' + str(id)
        try:
            html_text = requests.get(url=title_url, headers=self.headers).text
            # The detail page embeds JSON; pull fields with regex as the
            # original did. [0] raises IndexError when the page layout differs,
            # which we treat as "skip this topic".
            title = re.findall('.*?"text": "(.*?)",.*?', html_text)[0]
            # Strip HTML tags from the topic text.
            text = re.sub(r'<(\S*?)[^>]*>.*?|<.*? />', '', title)
            user_id = re.findall('.*?"id": "(.*?)",.*?', html_text)[0]
            user_nicname = re.findall('.*?"screen_name": "(.*?)",.*?', html_text)[0]
            user_gender = re.findall('.*?"gender": "(.*?)",.*?', html_text)[0]
            created_parts = re.findall('.*?"created_at": "(.*?)",.*?', html_text)[0].split(" ")
            title_created_YMD = self._format_date(created_parts)
            if title_created_YMD is None:
                # fix: the original left the date variable unbound here and
                # crashed into its bare except; skip the topic explicitly.
                return None
            add_title_time = created_parts[3]  # HH:MM:SS component
            reposts_count = re.findall('.*?"reposts_count": (.*?),.*?', html_text)[0]
            comments_count = re.findall('.*?"comments_count": (.*?),.*?', html_text)[0]
            attitudes_count = re.findall('.*?"attitudes_count": (.*?),.*?', html_text)[0]
            # Each ajax call returns 20 comments.
            comment_count = int(comments_count) // 20
            position1 = (title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
                         add_title_time, reposts_count, comments_count, attitudes_count,
                         " ", " ", " ", " ", " ", " ")
            print(title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
                  add_title_time, reposts_count, comments_count, attitudes_count)
            self.writer.writerow(position1)
            return comment_count
        except (IndexError, requests.RequestException) as e:
            # fix: was a bare `except: pass`; log the skip instead of hiding it.
            print('skip topic', id, ':', e)
            return None

    def get_page(self, id, max_id, id_type):
        """Fetch one ajax page of hot comments.

        :return: parsed JSON dict, or None on connection error / non-200.
        """
        params = {
            'max_id': max_id,
            'max_id_type': id_type
        }
        # fix: dropped the dangling '&max_id' fragment from the original URL;
        # requests appends the real max_id/max_id_type from `params`.
        url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}'.format(id, id)
        try:
            r = requests.get(url, params=params, headers=self.headers)
            if r.status_code == 200:
                return r.json()
        except requests.ConnectionError as e:
            print('error', e.args)
        return None

    def parse_page(self, jsondata):
        """Extract the pagination cursor from a comment-page response.

        :return: {'max_id': ..., 'max_id_type': ...} or None for empty input.
        """
        if not jsondata:
            return None
        items = jsondata.get('data')
        return {
            'max_id': items['max_id'],
            'max_id_type': items['max_id_type'],
        }

    def write_csv(self, jsondata):
        """Append one CSV row per comment in a comment-page response."""
        for comment in jsondata['data']['data']:
            user_id = comment['user']['id']
            user_name = comment['user']['screen_name']
            # 'm' = male, 'f' = female (per Weibo's API convention).
            user_gender = comment['user']['gender']
            # Strip HTML tags from the comment text.
            comment_text = re.sub(r'<(\S*?)[^>]*>.*?|<.*? />', '', comment['text'])
            created_times = comment['created_at'].split(' ')
            created_YMD = self._format_date(created_times)
            if created_YMD is None:
                # fix: the original fell through with created_YMD unbound;
                # skip out-of-range comments instead of crashing.
                print('该时间不在疫情范围内,估计数据有误!')
                continue
            created_time = created_times[3]  # HH:MM:SS component
            position2 = (" ", " ", " ", " ", " ", " ", " ", " ", " ", " ",
                         user_id, user_name, user_gender, created_YMD,
                         created_time, comment_text)
            self.writer.writerow(position2)

    def main(self):
        """Drive the crawl: list topics, then scrape each topic + its comments."""
        self.get_title_id()
        count_title = len(self.comments_ID)
        for count, comment_ID in enumerate(self.comments_ID):
            print("正在爬取第%s个话题,一共找到个%s话题需要爬取" % (count + 1, count_title))
            # Number of comment pages; None means the topic was skipped.
            max_page = self.spider_title(comment_ID)
            m_id = 0
            id_type = 0
            # fix: also guard against None (the original's `!= 0` test let
            # None through into range() and relied on a bare except).
            if max_page:
                try:
                    # Walk the comment pages using the server-side cursor.
                    for _ in range(max_page):
                        jsondata = self.get_page(comment_ID, m_id, id_type)
                        self.write_csv(jsondata)
                        results = self.parse_page(jsondata)
                        time.sleep(1)
                        m_id = results['max_id']
                        id_type = results['max_id_type']
                except (TypeError, KeyError) as e:
                    # Cursor/response missing (rate limit, end of data):
                    # stop paging this topic and move on.
                    print('comment paging stopped:', e)
            print("--------------------------分隔符---------------------------")
        self.csvfile.close()
if __name__ == '__main__':
    # Crawl listing pages 1..14 (range(1, 15)) and report elapsed minutes.
    start_time = time.time()
    spider = WeiBoSpider(15)
    spider.main()
    elapsed_minutes = (time.time() - start_time) / 60
    print("该次所获的信息一共使用%s分钟" % elapsed_minutes)
标签:链接 des nic error iter 定义函数 女性 BLB write
原文地址:https://www.cnblogs.com/zhouzetian/p/12569176.html