标签:
很多做社交媒体数据分析的同学需要采集一些新浪微博上的数据,新浪微博虽然有提供api,但免费的api对获取的数据项和获取的频率都有很大的限制,商业版api据说限制较少,但是作为学生党哪来那么多钱买商业版的api?!!!
直接写爬虫需先登录到新浪微博,否则爬虫一直返回登录页面不给数据。解决办法有两种:一是先用浏览器手动登录微博,把浏览器中的会话cookie复制给爬虫使用;二是让爬虫程序模拟微博的登录流程,自动获取带会话cookie的opener。
注:以下模拟登陆部分是参考文章:python模拟新浪微博登陆功能(新浪微博爬虫) 所写,我在该文章的基础上做了一些小改动。
第一种方案操作较为繁琐,尤其是要多个微博马甲轮询,降低马甲被封概率的话。第二种方案就可以实现批量马甲登录,但是有的账号登录的时候可能需要输入验证码,验证码识别起来比较困难,目前我还没有解决这个问题。
下面是我初步写的不带验证码识别的模拟登陆代码
WeiboLogin.py:定义了登录类 WeiboLogin。最后调用WeiboLogin.Login()方法会返回一个带有当前马甲会话cookie的opener。多个马甲模拟登陆的话,可以实例化多个WeiboLogin,调用它们的Login()方法返回带有各自会话cookie的不同opener,降低账号被封的风险。
import urllib2,traceback,cookielib import WeiboEncode import WeiboSearch class WeiboLogin: def __init__(self, user, pwd, enableProxy=False): "初始化WeiboLogin,Proxy默认关闭" print "Initializing WeiboLogin..." self.userName = user self.passWord = pwd self.enableProxy = enableProxy self.cookiejar = cookielib.LWPCookieJar()#建立cookie self.serverUrl = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.11)&_=1379834957683" self.loginUrl = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.11)" self.postHeader = {‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0‘} def Login(self): #"登陆程序" ERROR_COUNT =0 opener = self.EnableCookie() while True: if ERROR_COUNT>3: print ‘login error!‘ return False try: url = "http://weibo.com/" req = urllib2.Request(url, None, self.postHeader) opener.open(req) serverTime, nonce, pubkey, rsakv = self.GetServerTime(opener)#登陆的第一步 postData = WeiboEncode.PostEncode(self.userName, self.passWord, serverTime, nonce, pubkey, rsakv)#加密用户和密码 print "Post data length:\n", len(postData) req = urllib2.Request(self.loginUrl, postData, self.postHeader) print "Posting request..." result = opener.open(req)#登陆的第二步——解析新浪微博的登录过程中3 text = result.read() loginUrl = WeiboSearch.sRedirectData(text)#解析重定位结果 req = urllib2.Request(loginUrl, None, self.postHeader) temp = opener.open(loginUrl) except: print traceback.format_exc() print ‘retrying......‘ ERROR_COUNT+=1 continue if WeiboSearch.sCheckLoginResult(temp.read()):#检查登录返回信息 print ‘Login sucess!‘ return opener else : print ‘login error‘ return False def EnableCookie(self):#"Enable cookie & proxy (if needed)." 
cookie_support = urllib2.HTTPCookieProcessor(self.cookiejar) if self.enableProxy: proxy_support = urllib2.ProxyHandler({‘http‘:‘http://xxxxx.pac‘})#使用代理 opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler) print "Proxy enabled" else: opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler) return opener def GetServerTime(self,opener): "Get server time and nonce, which are used to encode the password" print "Getting server time and nonce..." req = urllib2.Request(self.serverUrl, None, self.postHeader) serverData = opener.open(req).read()#得到网页内容 print serverData try: serverTime, nonce, pubkey, rsakv = WeiboSearch.sServerData(serverData)#解析得到serverTime,nonce等 return serverTime, nonce, pubkey, rsakv except: print ‘Get server time & nonce error!‘ return None
WeiboSearch.py:主要是WebLogin.py需要的一些 检索、分析、检查函数
# -*- coding: utf-8 -*-
# WeiboSearch.py -- response-parsing helpers used by WeiboLogin.py.
import re
import json


def sServerData(serverData):
    """Search the server time & nonce from server data.

    serverData is the JSONP reply of prelogin.php, e.g. "cb({...})".
    Returns (serverTime, nonce, pubkey, rsakv); serverTime is a str.
    Raises AttributeError/KeyError if the payload is not the expected shape.
    """
    jsonData = re.search(r'\((.*)\)', serverData).group(1)
    data = json.loads(jsonData)
    serverTime = str(data['servertime'])
    nonce = data['nonce']
    pubkey = data['pubkey']  # RSA public-key modulus (hex string)
    rsakv = data['rsakv']    # RSA key version, echoed back in the POST
    # Single-argument print() form: identical output under Python 2 and 3.
    print('Server time is: %s' % serverTime)
    print('Nonce is: %s' % nonce)
    return serverTime, nonce, pubkey, rsakv


def sRedirectData(text):
    """Extract the redirect URL from the login reply's location.replace(...)."""
    loginUrl = re.search(r'location\.replace\([\'"](.*?)[\'"]\)', text).group(1)
    print('loginUrl: %s' % loginUrl)
    return loginUrl


def sCheckLoginResult(text):
    """Return True iff the final reply carries the 'result:true' callback."""
    pattern = re.compile(
        r'parent\.sinaSSOController\.feedBackUrlCallBack\(\{"result":true,"userinfo":\{"uniqueid":'
    )
    return pattern.search(text) is not None
WeiboEncode.py:用户名密码加密操作
# -*- coding: utf-8 -*-
# WeiboEncode.py -- credential encoding for the Weibo login POST (Python 2).
import urllib
import base64
import binascii

import rsa  # third-party pure-Python 'rsa' package (PublicKey / encrypt)


def PostEncode(userName, passWord, serverTime, nonce, pubkey, rsakv):
    """Used to generate POST data.

    serverTime/nonce/pubkey/rsakv are the dynamic values returned by
    prelogin.php (see WeiboSearch.sServerData). Returns the url-encoded
    form body expected by ssologin.js v1.4.11.
    """
    encodedUserName = GetUserName(userName)  # user name: url-quote + base64
    encodedPassWord = get_pwd(passWord, serverTime, nonce, pubkey)  # password: RSA
    postPara = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': encodedUserName,
        'service': 'miniblog',
        'servertime': serverTime,
        'nonce': nonce,
        'pwencode': 'rsa2',  # tells the server the password is RSA-encrypted
        'sp': encodedPassWord,
        'encoding': 'UTF-8',
        'prelt': '115',
        'rsakv': rsakv,
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    postData = urllib.urlencode(postPara)  # form-encode for the POST body
    return postData


def GetUserName(userName):
    "Used to encode user name"
    userNameTemp = urllib.quote(userName)
    # [:-1] drops the trailing newline that encodestring() appends.
    userNameEncoded = base64.encodestring(userNameTemp)[:-1]
    return userNameEncoded


def get_pwd(password, servertime, nonce, pubkey):
    """RSA-encrypt the password, mirroring ssologin.js.

    The plaintext is "servertime\\tnonce\\npassword"; the ciphertext is
    returned as a hex string.
    """
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)  # public exponent fixed at 65537
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
    passwd = rsa.encrypt(message, key)
    passwd = binascii.b2a_hex(passwd)  # hex-encode the ciphertext
    return passwd
使用方法:
# -*- coding: utf-8 -*-
# Usage: log one account in and keep the session-carrying opener.
from WeiboLogin import WeiboLogin

user_name = 'abc@sina.com'
passwd = '1234'
# Login() returns an opener holding the session cookies, or False on failure.
opener = WeiboLogin(user_name, passwd).Login()
即使模拟登陆成功,写好解析程序之后,你会发现爬虫爬一小会就又不返回数据了,这是因为微博服务器监测到该ip访问频率异常,一般采取的策略是加代理ip(一般的http代理即可,最好是高匿的)。而且经测试,每次更换代理的ip的时候不用重新登录,直接用当前生成的opener添加更换的ip代理handler即可:
# Swap in a new proxy on the SAME logged-in opener: the cookie jar (and so
# the session) is untouched, so no re-login is needed after changing the IP.
proxy_handler = urllib2.ProxyHandler({
    "http": proxyServer,
    "https": proxyServer,
})
opener.add_handler(proxy_handler)
标签:
原文地址:http://www.cnblogs.com/feiqiangs/p/5719447.html