码迷,mamicode.com
首页 > 编程语言 > 详细

python 简书用户爬虫

时间:2017-10-15 21:15:12      阅读:196      评论:0      收藏:0      [点我收藏+]

标签:文件   imp   dex   apple   规则   双向   ons   users   用户id   

 1 # python
 2 # -*- coding: utf-8 -*-
 3 """
 4 __title__ = ''
 5 __author__ = 'wlc'
 6 __mtime__ = '2017/10/15'
 7 """
 8 import re
 9 import time
10 import math
11 import csv
12 import requests
13 from bs4 import BeautifulSoup
14 from collections import deque
15 
# Create (or append to) the CSV file that stores the scraped user information.
path = 'dataCollection/userInfo.csv'
csvFile = open(path, 'a+', newline='', encoding='utf-8')
writer = csv.writer(csvFile)
# The file is opened in append mode, so rows from earlier runs are kept.
# Only emit the header row when the file is still empty; otherwise every run
# would insert another header line in the middle of the data.
csvFile.seek(0, 2)  # 2 == os.SEEK_END
if csvFile.tell() == 0:
    writer.writerow(('id', 'name', 'following', 'follower', 'article', 'word', 'like'))

# Global set of (user id, following count) pairs that have already been crawled.
idContainer = set()
# Double-ended queue holding the (user id, following count) pairs still to crawl.
linkDeque = deque()
26 
class jianshu(object):
    """Crawler that walks the Jianshu "following" graph and records user stats.

    For each crawled user it downloads the paginated "following" list, extracts
    every listed user's counters with regular expressions, writes one row per
    listed user through the module-level ``writer`` and queues the listed users
    in the module-level ``linkDeque`` so that they are crawled in turn.
    """

    # Number of entries assumed per "following" page when computing the page
    # count (the original code divided the following count by 10).
    pageSize = 10

    def __init__(self):
        # URL template for one page of a user's "following" list.
        self.url = 'http://www.jianshu.com/users/{userId}/following?page={page}'
        # Captures (user id, user name) from the profile link.
        self.idPattern = re.compile(r'<a class="name" href="/u/(.*?)">(.*?)</a>')
        # Captures the (following, followers, articles) counters.
        self.metalPattern = re.compile(r'<span>关注 (\d+)</span><span>粉丝 (\d+)</span><span>文章 (\d+)</span>')
        # Captures the (words written, likes received) counters. Both the ASCII
        # and the full-width comma are accepted because the published listing
        # may have normalised the punctuation.
        self.meta = re.compile(r'写了 (\d+) 字[,，]获得了 (\d+) 个喜欢')
        # Pretend to be a regular browser.
        self.header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}

    def createRequest(self, userId, page):
        """Download one page of ``userId``'s "following" list and return its HTML text."""
        url = self.url.format(userId=userId, page=page)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        return requests.get(url, headers=self.header, timeout=10).text

    def pageResponse(self, requ):
        """Return the HTML of each entry in the page's ``ul.user-list`` as strings.

        An empty list is returned when the page contains no such list.
        """
        bsOBJ = BeautifulSoup(requ, 'lxml')
        containers = bsOBJ.find_all('ul', {'class': 'user-list'})
        if not containers:
            return []
        # Bare newline nodes between the entries carry no user data.
        return [str(user) for user in containers[0].contents if user != '\n']

    def parserUserInfo(self, user):
        """Extract one user's fields from ``user`` (HTML text) and write them to the CSV.

        Returns the tuple ``(id, name, following, follower, article, word, like)``
        that was written. Counters that cannot be found are recorded as empty
        strings. Returns ``None`` without writing anything when no user id/name
        link is present.
        """
        idMatch = self.idPattern.search(user)
        if idMatch is None:
            return None
        userId, name = idMatch.groups()

        statsMatch = self.metalPattern.search(user)
        followingNum, followerNum, articleNum = statsMatch.groups() if statsMatch else ('', '', '')

        # This pair is sometimes absent from an entry; the original code raised
        # "index out of range" here.
        metaMatch = self.meta.search(user)
        wordNum, likeNum = metaMatch.groups() if metaMatch else ('', '')

        content = (userId, name, followingNum, followerNum, articleNum, wordNum, likeNum)
        writer.writerow(content)
        return content

    def getUserList(self, userId, following):
        """Crawl ``userId``'s "following" list, then every user discovered from it.

        ``following`` is the number of users ``userId`` follows (int or numeric
        string); it determines how many pages are requested. The traversal is
        breadth-first over ``linkDeque`` and skips user ids already recorded in
        ``idContainer``. It ends only when no unvisited users remain.
        """
        # De-duplicate on the user id alone: the same user may be seen with the
        # count as an int in one place and as a string in another.
        visitedIds = {uid for uid, _ in idContainer}
        linkDeque.append((userId, following))
        # Consume the queue with popleft instead of iterating over it, because
        # new users are appended while the crawl is running.
        while linkDeque:
            currentId, currentFollowing = linkDeque.popleft()
            if currentId in visitedIds:
                continue
            visitedIds.add(currentId)
            idContainer.add((currentId, currentFollowing))

            # An unknown count (empty string) means there are no pages to fetch.
            pageCount = math.ceil(int(currentFollowing or 0) / self.pageSize)
            for pg in range(1, pageCount + 1):
                requ = self.createRequest(currentId, pg)
                for user in self.pageResponse(requ):
                    content = self.parserUserInfo(user)
                    if content is None:
                        continue
                    if content[0] not in visitedIds:
                        linkDeque.append((content[0], content[2]))
                # Pause between page requests.
                time.sleep(1)
# Start the crawl from the seed user (user id, number of users they follow).
# The result is not assigned back to ``jianshu``: getUserList returns None, and
# rebinding the name would replace the class with None.
jianshu().getUserList('652fbdd1e7b3', 162)

 

python 简书用户爬虫

标签:文件   imp   dex   apple   规则   双向   ons   users   用户id   

原文地址:http://www.cnblogs.com/wlc297984368/p/7673777.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!